#full_data <- readRDS('../data/full_data_20230930.rds')
hourly_full <- readRDS('../data/hourly_full_20230930.rds')
daily_full <- readRDS('../data/daily_full_20230930.rds')
daily_full <- daily_full %>%
filter(!is.na(H2S_daily_avg))
hourly_full <- hourly_full %>%
filter(!is.na(H2S_hourly_avg))
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4850485 259.1 8354324 446.2 NA 7392759 394.9
## Vcells 19275994 147.1 37130200 283.3 24576 31967808 243.9
# full_data <- full_data %>%
# filter(!is.na(H2S)) %>%
# select(-starts_with('daily_'), -starts_with('H2S_daily'),
# -all_of(c('Ammonia', 'Benzene', 'Black Carbon', 'DST', 'utm_x', 'utm_y',
# 'county')))
# gc()
monitor_names <- c("ElSegundo" = "El Segundo",
"StAnthony" = "St. Anthony",
"Manhattan" = "Manhattan",
"WestHS" = "West HS",
"ElmAve" = "Elm Ave",
"NorthHS" = "North HS",
"GuenserPark" = "Guenser Park",
"Chico" = "213th & Chico",
"Judson" = "Judson",
"HarborPark" = "Harbor Park",
"FirstMethodist" = "First Methodist",
"GStreet" = "G Street",
"StLuke" = "St. Luke",
"Hudson" = "Hudson",
"InnerPort" = "Inner Port")
base_monitor_stat <- daily_full %>%
group_by(Monitor) %>%
summarise('Start Date' = strftime(min(day), '%Y-%m-%d'),
'End Date' = strftime(max(day), '%Y-%m-%d'),
'Closest Refinery' = unique(closest_ref),
'Distance to Nearest Refinery (m)' = round(unique(dist_ref)),
'Angle to Refinery' = unique(angle_ref),
'Distance to Nearest WRP (m)' = round(unique(dist_wrp)),
'Capacity of Nearest WRP' = unique(closest_wrp_capacity),
'Angle to WRP' = round(unique(angle_wrp)),
'Distance to Dominguez Channel (m)' = round(unique(dist_dc)),
'Elevation' = unique(elevation),
'Enhanced Vegetation Index' = unique(EVI)) %>%
mutate(`Closest Refinery` = case_when(`Closest Refinery` == "Phillips 66 (Wilmington)" ~ "Phillips 66",
`Closest Refinery` == "Torrance Refinery" ~ "Torrance",
`Closest Refinery` == "Valero Refinery" ~ "Valero",
`Closest Refinery` == "Marathon (Carson)" ~ "Marathon Carson",
`Closest Refinery` == "Marathon (Wilmington)" ~ "Marathon Wilmington",
.default = `Closest Refinery`)) %>%
mutate(Monitor = str_replace_all(Monitor, monitor_names)) %>%
arrange(factor(Monitor, levels = unname(monitor_names)))
knitr::kable(base_monitor_stat, digits = 2, format = 'html')
| Monitor | Start Date | End Date | Closest Refinery | Distance to Nearest Refinery (m) | Angle to Refinery | Distance to Nearest WRP (m) | Capacity of Nearest WRP | Angle to WRP | Distance to Dominguez Channel (m) | Elevation | Enhanced Vegetation Index |
|---|---|---|---|---|---|---|---|---|---|---|---|
| El Segundo | 2020-01-01 | 2020-08-13 | Chevron El Segundo | 1174 | 190.87 | 2810 | 850 | 294 | 6437 | 60 | 0.17 |
| St. Anthony | 2020-04-23 | 2023-09-30 | Chevron El Segundo | 970 | 188.57 | 2825 | 850 | 298 | 6543 | 44 | 0.17 |
| Manhattan | 2020-03-24 | 2023-09-30 | Chevron El Segundo | 2341 | 341.46 | 5462 | 850 | 325 | 6145 | 42 | 0.19 |
| West HS | 2020-01-01 | 2022-04-29 | Torrance | 3536 | 85.69 | 9333 | 400 | 123 | 1547 | 36 | 0.15 |
| Elm Ave | 2020-01-01 | 2023-09-30 | Torrance | 1362 | 5.34 | 5966 | 400 | 132 | 3955 | 32 | 0.07 |
| North HS | 2020-01-01 | 2022-04-29 | Torrance | 1779 | 160.72 | 8569 | 400 | 145 | 4858 | 24 | 0.15 |
| Guenser Park | 2020-04-23 | 2023-09-30 | Torrance | 2400 | 220.25 | 7702 | 400 | 159 | 375 | 16 | 0.14 |
| 213th & Chico | 2021-10-14 | 2022-01-28 | Marathon Carson | 2879 | 145.62 | 4297 | 400 | 213 | 50 | 7 | 0.12 |
| Judson | 2020-02-25 | 2023-09-30 | Marathon Carson | 2715 | 112.45 | 2692 | 400 | 213 | 1481 | 13 | 0.14 |
| Harbor Park | 2020-01-01 | 2023-09-30 | Phillips 66 | 1463 | 183.71 | 2012 | 400 | 6 | 4262 | 12 | 0.60 |
| First Methodist | 2020-03-04 | 2023-09-30 | Phillips 66 | 1124 | 205.55 | 2456 | 400 | 355 | 3792 | 14 | 0.21 |
| G Street | 2021-01-20 | 2023-09-30 | Phillips 66 | 717 | 222.51 | 2940 | 400 | 356 | 3748 | 8 | 0.09 |
| St. Luke | 2020-02-18 | 2023-09-30 | Marathon Carson | 2768 | 260.38 | 6910 | 400 | 256 | 1790 | 10 | 0.17 |
| Hudson | 2020-01-01 | 2023-09-30 | Marathon Wilmington | 1378 | 240.55 | 5920 | 400 | 272 | 705 | 8 | 0.14 |
| Inner Port | 2020-04-22 | 2023-09-30 | Valero | 2022 | 260.92 | 5970 | 15 | 228 | 1937 | 5 | 0.04 |
sincefeb2022_stat <- daily_full %>%
filter(day > '2022-01-31') %>%
group_by(Monitor) %>%
summarise('Daily observations' = n(),
'Max Daily Max' = max(H2S_daily_max, na.rm=T),
'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
'Average Wind Speed' = mean(ws_avg, na.rm=T),
'Average Wind Direction' = as.numeric(mean(circular(wd_avg,
units = 'degrees'),
na.rm=T)),
'Average daily odor complaints within zipcode' = mean(num_odor_complaints, na.rm=T),
'Active wells 2km' = mean(active_2km, na.rm=T)) %>%
mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0,
`Average Wind Direction`+360,
`Average Wind Direction`)) %>%
mutate(Monitor = str_replace_all(Monitor, monitor_names)) %>%
arrange(factor(Monitor, levels = unname(monitor_names)))
knitr::kable(sincefeb2022_stat, digits = 2, format = 'html')
| Monitor | Daily observations | Max Daily Max | Max Daily Average | Avg Daily Max | Avg Daily Average | Average Wind Speed | Average Wind Direction | Average daily odor complaints within zipcode | Active wells 2km |
|---|---|---|---|---|---|---|---|---|---|
| St. Anthony | 608 | 140.15 | 3.41 | 2.45 | 0.52 | 7.83 | 236.36 | 4.65 | 1.96 |
| Manhattan | 599 | 6.82 | 1.40 | 0.82 | 0.39 | 3.40 | 233.30 | 0.04 | 0.00 |
| West HS | 88 | 2.90 | 0.88 | 1.09 | 0.40 | 6.24 | 272.30 | 0.03 | 3.00 |
| Elm Ave | 608 | 5.10 | 3.10 | 1.60 | 0.89 | 4.53 | 243.43 | 0.04 | 2.00 |
| North HS | 89 | 4.00 | 1.61 | 1.37 | 0.51 | 5.36 | 240.86 | 0.00 | 0.00 |
| Guenser Park | 546 | 7.66 | 2.66 | 1.76 | 0.80 | 4.43 | 260.55 | 0.02 | 0.00 |
| Judson | 583 | 9.75 | 2.80 | 1.69 | 0.54 | 3.80 | 272.41 | 0.53 | 0.00 |
| Harbor Park | 414 | 9.65 | 2.48 | 1.85 | 0.50 | 2.89 | 296.16 | 0.07 | 44.30 |
| First Methodist | 606 | 14.72 | 2.34 | 2.33 | 0.71 | 3.27 | 269.13 | 0.11 | 29.22 |
| G Street | 607 | 39.18 | 2.72 | 3.82 | 0.83 | 4.96 | 277.89 | 0.11 | 19.36 |
| St. Luke | 602 | 11.62 | 3.43 | 2.25 | 0.74 | 3.66 | 291.26 | 0.23 | 5.00 |
| Hudson | 599 | 98.17 | 6.08 | 2.95 | 0.99 | 3.49 | 39.57 | 0.23 | 2.70 |
| Inner Port | 582 | 53.70 | 5.39 | 6.31 | 0.87 | 5.07 | 227.02 | 0.05 | 83.81 |
disaster_stat <- daily_full %>%
filter(year == '2021', month %in% c('10', '11', '12')) %>%
group_by(Monitor) %>%
summarise('Daily observations' = n(),
'Max Daily Max' = max(H2S_daily_max, na.rm=T),
'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
'Average Wind Speed' = mean(ws_avg, na.rm=T),
'Average Wind Direction' = as.numeric(mean(circular(wd_avg,
units = 'degrees'),
na.rm=T)),
'Average daily odor complaints within zipcode' = mean(num_odor_complaints, na.rm=T),
'Active wells 2km' = mean(active_2km, na.rm=T)) %>%
mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0,
`Average Wind Direction`+360,
`Average Wind Direction`)) %>%
mutate(Monitor = str_replace_all(Monitor, monitor_names)) %>%
arrange(factor(Monitor, levels = unname(monitor_names)))
knitr::kable(disaster_stat, digits = 2, format = 'html')
| Monitor | Daily observations | Max Daily Max | Max Daily Average | Avg Daily Max | Avg Daily Average | Average Wind Speed | Average Wind Direction | Average daily odor complaints within zipcode | Active wells 2km |
|---|---|---|---|---|---|---|---|---|---|
| St. Anthony | 92 | 20.21 | 2.41 | 3.01 | 0.69 | 7.07 | 343.87 | 2.96 | 3.00 |
| Manhattan | 92 | 25.45 | 4.06 | 3.48 | 1.51 | 2.74 | 212.03 | 0.11 | 0.00 |
| West HS | 92 | 41.50 | 4.87 | 3.50 | 0.74 | 3.95 | 239.76 | 0.53 | 3.00 |
| Elm Ave | 92 | 63.50 | 8.15 | 7.73 | 1.34 | 3.77 | 236.77 | 0.53 | 2.00 |
| North HS | 92 | 98.50 | 7.88 | 6.93 | 1.15 | 3.74 | 235.24 | 0.46 | 0.00 |
| Guenser Park | 92 | 211.67 | 14.48 | 14.64 | 1.63 | 3.51 | 276.58 | 0.46 | 0.00 |
| 213th & Chico | 79 | 13407.18 | 1639.53 | 1025.43 | 128.07 | 3.55 | 293.63 | 17.15 | 0.00 |
| Judson | 92 | 742.25 | 69.08 | 59.79 | 6.86 | 2.77 | 294.11 | 31.71 | 0.00 |
| Harbor Park | 92 | 75.93 | 9.46 | 8.51 | 1.20 | 2.30 | 306.88 | 0.49 | 42.99 |
| First Methodist | 92 | 149.47 | 14.11 | 10.92 | 1.74 | 2.62 | 296.60 | 1.64 | 30.33 |
| G Street | 92 | 48.67 | 6.61 | 9.15 | 1.75 | 3.58 | 334.25 | 1.64 | 18.33 |
| St. Luke | 92 | 119.72 | 12.26 | 10.91 | 2.43 | 3.07 | 351.66 | 0.91 | 3.00 |
| Hudson | 92 | 192.64 | 19.45 | 16.72 | 3.36 | 2.53 | 13.51 | 0.91 | 2.34 |
| Inner Port | 90 | 136.50 | 12.21 | 15.91 | 3.18 | 3.47 | 9.29 | 0.09 | 86.34 |
# Try only the october for the prediction map
disaster_oct_stat <- daily_full %>%
filter(year == '2021', month == '10') %>%
group_by(Monitor) %>%
summarise('Daily observations' = n(),
'Max Daily Max' = max(H2S_daily_max, na.rm=T),
'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
'Average Wind Speed' = mean(ws_avg, na.rm=T),
'Average Wind Direction' = as.numeric(mean(circular(wd_avg,
units = 'degrees'),
na.rm=T)),
'Average daily odor complaints within zipcode' = mean(num_odor_complaints, na.rm=T),
'Active wells 2km' = mean(active_2km, na.rm=T)) %>%
mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0,
`Average Wind Direction`+360,
`Average Wind Direction`)) %>%
mutate(Monitor = str_replace_all(Monitor, monitor_names)) %>%
arrange(factor(Monitor, levels = unname(monitor_names)))
knitr::kable(disaster_oct_stat, digits = 2, format = 'html')
| Monitor | Daily observations | Max Daily Max | Max Daily Average | Avg Daily Max | Avg Daily Average | Average Wind Speed | Average Wind Direction | Average daily odor complaints within zipcode | Active wells 2km |
|---|---|---|---|---|---|---|---|---|---|
| St. Anthony | 31 | 20.21 | 2.41 | 3.89 | 0.73 | 7.49 | 304.48 | 6.39 | 3 |
| Manhattan | 31 | 25.45 | 4.06 | 5.47 | 1.77 | 2.91 | 225.92 | 0.29 | 0 |
| West HS | 31 | 41.50 | 4.87 | 7.56 | 1.06 | 4.35 | 231.91 | 1.42 | 3 |
| Elm Ave | 31 | 63.50 | 8.15 | 18.37 | 2.37 | 4.16 | 238.62 | 1.42 | 2 |
| North HS | 31 | 98.50 | 7.88 | 16.83 | 2.02 | 4.11 | 236.18 | 1.32 | 0 |
| Guenser Park | 31 | 211.67 | 14.48 | 37.77 | 3.12 | 3.88 | 263.18 | 1.32 | 0 |
| 213th & Chico | 18 | 13407.18 | 1639.53 | 4337.41 | 536.21 | 4.41 | 260.39 | 49.44 | 0 |
| Judson | 31 | 742.25 | 69.08 | 167.25 | 17.18 | 2.91 | 290.90 | 79.10 | 0 |
| Harbor Park | 31 | 75.93 | 9.46 | 20.66 | 2.63 | 2.29 | 301.72 | 1.23 | 40 |
| First Methodist | 31 | 149.47 | 14.11 | 25.88 | 3.25 | 2.67 | 288.59 | 4.65 | 29 |
| G Street | 31 | 47.27 | 6.61 | 14.01 | 2.41 | 3.65 | 323.75 | 4.65 | 17 |
| St. Luke | 31 | 119.72 | 12.26 | 22.38 | 4.26 | 3.23 | 330.94 | 1.77 | 3 |
| Hudson | 31 | 192.64 | 19.45 | 39.27 | 5.95 | 2.76 | 343.90 | 1.77 | 2 |
| Inner Port | 31 | 136.50 | 12.21 | 22.60 | 3.79 | 3.87 | 325.23 | 0.26 | 87 |
normal_stat <- daily_full %>%
filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>%
group_by(Monitor) %>%
summarise('Daily observations' = n(),
'Max Daily Max' = max(H2S_daily_max, na.rm=T),
'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
'Average Wind Speed' = mean(ws_avg, na.rm=T),
'Average Wind Direction' = as.numeric(mean(circular(wd_avg,
units = 'degrees'),
na.rm=T)),
'Average daily odor complaints within zipcode' = mean(num_odor_complaints, na.rm=T),
'Active wells 2km' = mean(active_2km, na.rm=T)) %>%
mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0,
`Average Wind Direction`+360,
`Average Wind Direction`)) %>%
mutate(Monitor = str_replace_all(Monitor, monitor_names)) %>%
arrange(factor(Monitor, levels = unname(monitor_names)))
knitr::kable(normal_stat, digits = 2, format = 'html')
| Monitor | Daily observations | Max Daily Max | Max Daily Average | Avg Daily Max | Avg Daily Average | Average Wind Speed | Average Wind Direction | Average daily odor complaints within zipcode | Active wells 2km |
|---|---|---|---|---|---|---|---|---|---|
| El Segundo | 223 | 9.24 | 3.67 | 1.70 | 0.57 | 7.92 | 255.36 | 0.25 | 2.19 |
| St. Anthony | 1160 | 140.15 | 3.43 | 2.64 | 0.57 | 7.76 | 243.55 | 4.63 | 1.98 |
| Manhattan | 1180 | 6.82 | 2.38 | 1.03 | 0.53 | 4.02 | 241.77 | 0.06 | 0.00 |
| West HS | 755 | 18.50 | 3.01 | 1.54 | 0.60 | 5.40 | 247.07 | 0.04 | 1.91 |
| Elm Ave | 1268 | 11.40 | 3.20 | 1.87 | 0.92 | 4.53 | 242.45 | 0.04 | 1.86 |
| North HS | 758 | 30.60 | 2.71 | 1.91 | 1.02 | 4.62 | 241.06 | 0.01 | 0.00 |
| Guenser Park | 1073 | 16.18 | 2.66 | 1.50 | 0.62 | 4.55 | 259.26 | 0.02 | 0.00 |
| 213th & Chico | 28 | 14.27 | 3.92 | 4.78 | 3.12 | 2.34 | 312.27 | 2.75 | 0.00 |
| Judson | 1186 | 15.78 | 3.63 | 1.99 | 0.72 | 4.41 | 266.20 | 0.43 | 0.00 |
| Harbor Park | 1018 | 17.42 | 2.67 | 1.76 | 0.47 | 3.37 | 283.50 | 0.07 | 46.82 |
| First Methodist | 1208 | 20.91 | 4.18 | 2.42 | 0.76 | 3.81 | 263.94 | 0.16 | 30.09 |
| G Street | 883 | 382.76 | 18.66 | 4.13 | 0.83 | 4.74 | 297.94 | 0.13 | 19.30 |
| St. Luke | 1177 | 11.62 | 3.43 | 2.09 | 0.69 | 3.94 | 270.65 | 0.16 | 3.86 |
| Hudson | 1266 | 98.17 | 6.08 | 2.87 | 0.98 | 3.73 | 254.80 | 0.15 | 3.15 |
| Inner Port | 1139 | 53.70 | 5.39 | 5.54 | 0.84 | 4.82 | 229.05 | 0.05 | 88.32 |
# Try only the Oct 2022
normal_oct_stat <- daily_full %>%
filter(year == '2022' & month == '10') %>%
group_by(Monitor) %>%
summarise('Daily observations' = n(),
'Max Daily Max' = max(H2S_daily_max, na.rm=T),
'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
'Average Wind Speed' = mean(ws_avg, na.rm=T),
'Average Wind Direction' = as.numeric(mean(circular(wd_avg,
units = 'degrees'),
na.rm=T)),
'Average daily odor complaints within zipcode' = mean(num_odor_complaints, na.rm=T),
'Active wells 2km' = mean(active_2km, na.rm=T)) %>%
mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0,
`Average Wind Direction`+360,
`Average Wind Direction`)) %>%
mutate(Monitor = str_replace_all(Monitor, monitor_names)) %>%
arrange(factor(Monitor, levels = unname(monitor_names)))
knitr::kable(normal_oct_stat, digits = 2, format = 'html')
| Monitor | Daily observations | Max Daily Max | Max Daily Average | Avg Daily Max | Avg Daily Average | Average Wind Speed | Average Wind Direction | Average daily odor complaints within zipcode | Active wells 2km |
|---|---|---|---|---|---|---|---|---|---|
| St. Anthony | 31 | 4.24 | 1.54 | 2.12 | 0.98 | 6.65 | 259.53 | 14.13 | 3 |
| Manhattan | 23 | 1.95 | 0.89 | 0.96 | 0.46 | 2.68 | 224.60 | 0.00 | 0 |
| Elm Ave | 31 | 3.48 | 1.59 | 1.70 | 1.01 | 3.45 | 239.50 | 0.19 | 2 |
| Guenser Park | 31 | 6.50 | 2.02 | 2.59 | 1.23 | 3.14 | 266.04 | 0.00 | 0 |
| Judson | 31 | 2.97 | 0.63 | 0.77 | 0.25 | 2.59 | 284.61 | 1.16 | 0 |
| Harbor Park | 28 | 7.04 | 0.90 | 2.25 | 0.42 | 2.24 | 293.73 | 0.29 | 45 |
| First Methodist | 31 | 8.42 | 2.09 | 2.60 | 0.73 | 2.66 | 275.98 | 0.26 | 30 |
| G Street | 31 | 11.77 | 1.73 | 4.07 | 1.00 | 3.50 | 331.32 | 0.26 | 20 |
| St. Luke | 31 | 6.40 | 1.64 | 2.01 | 0.51 | 2.78 | 8.03 | 0.65 | 5 |
| Hudson | 29 | 13.34 | 2.12 | 3.18 | 1.20 | 2.71 | 57.46 | 0.62 | 3 |
| Inner Port | 31 | 52.63 | 2.07 | 6.95 | 0.80 | 3.02 | 161.12 | 0.00 | 80 |
table1 <- base_monitor_stat %>%
select(-c(`Angle to Refinery`, `Angle to WRP`, `Capacity of Nearest WRP`)) %>%
left_join(disaster_stat %>%
select(Monitor, `Avg Daily Average`) %>%
rename(`Disaster Avg Daily Average` = `Avg Daily Average`),
join_by(Monitor)) %>%
left_join(normal_stat %>%
select(Monitor, `Avg Daily Average`, `Average daily odor complaints within zipcode`) %>%
rename(`Normal Avg Daily Average` = `Avg Daily Average`,
`Normal Avg Daily odor complaints` = `Average daily odor complaints within zipcode`),
join_by(Monitor)) %>%
mutate(`Closest Refinery` = paste0(`Closest Refinery`, ' (', round(`Distance to Nearest Refinery (m)`/1000, 1), ')'),
'#' = 1:n()) %>%
select(-`Distance to Nearest Refinery (m)`) %>%
relocate('#', Monitor, `Start Date`, `End Date`, `Closest Refinery`, `Normal Avg Daily Average`, `Disaster Avg Daily Average`, `Normal Avg Daily odor complaints`)
table1_kable <- knitr::kable(table1, format = 'latex', digits = 2)
writeLines(table1_kable, '../figures/table1.tex')
knitr::kable(table1, format = 'html', digits = 2)
| # | Monitor | Start Date | End Date | Closest Refinery | Normal Avg Daily Average | Disaster Avg Daily Average | Normal Avg Daily odor complaints | Distance to Nearest WRP (m) | Distance to Dominguez Channel (m) | Elevation | Enhanced Vegetation Index |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | El Segundo | 2020-01-01 | 2020-08-13 | Chevron El Segundo (1.2) | 0.57 | NA | 0.25 | 2810 | 6437 | 60 | 0.17 |
| 2 | St. Anthony | 2020-04-23 | 2023-09-30 | Chevron El Segundo (1) | 0.57 | 0.69 | 4.63 | 2825 | 6543 | 44 | 0.17 |
| 3 | Manhattan | 2020-03-24 | 2023-09-30 | Chevron El Segundo (2.3) | 0.53 | 1.51 | 0.06 | 5462 | 6145 | 42 | 0.19 |
| 4 | West HS | 2020-01-01 | 2022-04-29 | Torrance (3.5) | 0.60 | 0.74 | 0.04 | 9333 | 1547 | 36 | 0.15 |
| 5 | Elm Ave | 2020-01-01 | 2023-09-30 | Torrance (1.4) | 0.92 | 1.34 | 0.04 | 5966 | 3955 | 32 | 0.07 |
| 6 | North HS | 2020-01-01 | 2022-04-29 | Torrance (1.8) | 1.02 | 1.15 | 0.01 | 8569 | 4858 | 24 | 0.15 |
| 7 | Guenser Park | 2020-04-23 | 2023-09-30 | Torrance (2.4) | 0.62 | 1.63 | 0.02 | 7702 | 375 | 16 | 0.14 |
| 8 | 213th & Chico | 2021-10-14 | 2022-01-28 | Marathon Carson (2.9) | 3.12 | 128.07 | 2.75 | 4297 | 50 | 7 | 0.12 |
| 9 | Judson | 2020-02-25 | 2023-09-30 | Marathon Carson (2.7) | 0.72 | 6.86 | 0.43 | 2692 | 1481 | 13 | 0.14 |
| 10 | Harbor Park | 2020-01-01 | 2023-09-30 | Phillips 66 (1.5) | 0.47 | 1.20 | 0.07 | 2012 | 4262 | 12 | 0.60 |
| 11 | First Methodist | 2020-03-04 | 2023-09-30 | Phillips 66 (1.1) | 0.76 | 1.74 | 0.16 | 2456 | 3792 | 14 | 0.21 |
| 12 | G Street | 2021-01-20 | 2023-09-30 | Phillips 66 (0.7) | 0.83 | 1.75 | 0.13 | 2940 | 3748 | 8 | 0.09 |
| 13 | St. Luke | 2020-02-18 | 2023-09-30 | Marathon Carson (2.8) | 0.69 | 2.43 | 0.16 | 6910 | 1790 | 10 | 0.17 |
| 14 | Hudson | 2020-01-01 | 2023-09-30 | Marathon Wilmington (1.4) | 0.98 | 3.36 | 0.15 | 5920 | 705 | 8 | 0.14 |
| 15 | Inner Port | 2020-04-22 | 2023-09-30 | Valero (2) | 0.84 | 3.18 | 0.05 | 5970 | 1937 | 5 | 0.04 |
hourly_responses <- c('H2S_hourly_avg', 'H2S_hourly_max')
# since feb 2022
daily_data_sincefeb2022 <- daily_full %>% filter(day > '2022-01-31')
hourly_data_sincefeb2022 <- hourly_full %>% filter(day > '2022-01-31')
# Disaster
daily_data_dis <- daily_full %>% filter(year == '2021', month %in% c('10', '11', '12'))
hourly_data_dis <- hourly_full %>% filter(year == '2021', month %in% c('10', '11', '12'))
# Exclude disaster stepwise
daily_data_excl_dis <- daily_full %>% filter(!(year == '2021' & month %in% c('10', '11', '12')))
hourly_data_excl_dis <- hourly_full %>% filter(!(year == '2021' & month %in% c('10', '11', '12')))
# Everything w. disaster indicator
daily_data_dis_ind <- daily_full %>%
mutate(disaster =
if_else(year == '2021', month %in% c('10', '11', '12'), 1, 0))
hourly_data_dis_ind <- hourly_full %>%
mutate(disaster =
if_else(year == '2021', month %in% c('10', '11', '12'), 1, 0))
daily_responses <- c('H2S_daily_avg', 'log(H2S_daily_avg)',
'H2S_daily_max', 'log(H2S_daily_max)')
hourly_responses <- c('H2S_hourly_avg', 'log(H2S_hourly_avg)',
'H2S_hourly_max', 'log(H2S_hourly_max)')
dateranges <- c('sincefeb2022', 'dis', 'excl_dis', 'dis_ind', 'full')
smooth <- c("s(as.numeric(month),bs='cc')",
"s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10)",
"te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), k=c(10,10),d=c(2,1),bs=c('tp','cc'))")
smooth_tibble <- tibble(features = c(list(c(smooth[1])),
list(c(smooth[2])),
list(c(smooth[3])),
list(c(smooth[1:2])),
list(c(smooth[c(1, 3)])),
list(c(smooth[2:3])),
list(c(smooth[1:3]))),
disaster_applicable = c(0, 1, 1, 0, 0, 1, 0))
# smooth_compare <- crossing(response = c(daily_responses, hourly_responses),
# daterange = dateranges) %>%
# cross_join(smooth_tibble) %>%
# mutate(GCV = NA) %>%
# filter(!(daterange == 'dis' & disaster_applicable == 0)) %>%
# select(-disaster_applicable)
#
# for (i in 1:nrow(smooth_compare)) {
# features <- unname(unlist(smooth_compare[i,'features']))
# formula_feature_str <- paste(features, collapse = ' + ')
# formula_str <- paste(smooth_compare[[i, 'response']], formula_feature_str, sep = ' ~ ')
# formula <- as.formula(formula_str)
#
# if (smooth_compare[[i, 'response']] %in% hourly_responses &
# smooth_compare[[i, 'daterange']] == 'full') {
# data <- hourly_full
# } else if (smooth_compare[[i, 'response']] %in% hourly_responses){
# data <- get(paste0('hourly_data_', smooth_compare[[i, 'daterange']]))
# } else if (smooth_compare[[i, 'daterange']] == 'full') {
# data <- daily_full
# } else {
# data <- get(paste0('daily_data_', smooth_compare[[i, 'daterange']]))
# }
#
# summary <- summary(gam(formula, data = data, method = 'GCV.Cp', select = TRUE))
# GCV_new <- summary$sp.criterion[[1]]
# smooth_compare[i, 'GCV'] <- GCV_new
# print(str_glue('Completed {i} iterations'))
# gc()
# }
#
# smooth_compare <- smooth_compare %>%
# group_by(response, daterange) %>%
# mutate(best = if_else(GCV == min(GCV), 1, 0)) %>%
# mutate(rounded_GCV = round(GCV, 2)) %>%
# rowwise() %>%
# mutate(month_smooth = if_else(smooth[1] %in% unlist(features), 1, 0),
# coord_smooth = if_else(smooth[2] %in% unlist(features), 1, 0),
# coord_day_3D_smooth = if_else(smooth[3] %in% unlist(features), 1, 0)) %>%
# ungroup()
# saveRDS(smooth_compare, 'smooth_compare.rds')
smooth_compare <- readRDS('smooth_compare.rds')
# get best smooth models for diff response and daterange
best_smooth <- smooth_compare %>%
group_by(response, daterange) %>%
filter(GCV == min(GCV)) %>%
ungroup() %>%
select(response, daterange, GCV, features, month_smooth, coord_smooth, coord_day_3D_smooth)
best_smooth
response_names <- c('da', 'log_da', 'dm', 'log_dm', 'ha', 'log_ha', 'hm', 'log_hm')
# smooth_models <- tibble(name = response_names,
# response = c(daily_responses, hourly_responses)) %>%
# crossing(tibble(daterange = dateranges)) %>%
# mutate(name = paste(name, daterange, sep = '_'))
smooth_predictors <-
c("s(as.numeric(month),bs='cc')",
"s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10)",
"te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), k=c(10,10),d=c(2,1),bs=c('tp','cc'))")
# smooth_models$residuals <- rep(list(c()), nrow(smooth_models))
#
# for (i in 1:nrow(smooth_models)) {
# # first, get residuals of smooth on response
# if (smooth_models$daterange[i] == 'dis') {
# formula_feature_str <- paste(smooth_predictors[c(2, 3)], collapse = ' + ')
# } else {
# formula_feature_str <- paste(smooth_predictors, collapse = ' + ')
# }
#
# if (smooth_models[[i, 'response']] %in% hourly_responses &
# smooth_models[[i, 'daterange']] == 'full') {
# data <- hourly_full
# } else if (smooth_models[[i, 'response']] %in% hourly_responses) {
# data <- get(paste0('hourly_data_', smooth_models[[i, 'daterange']]))
# } else if (smooth_models[[i, 'daterange']] == 'full') {
# data <- daily_full
# } else {
# data <- get(paste0('daily_data_', smooth_models[[i, 'daterange']]))
# }
#
# formula_str <- paste(smooth_models[[i, 'response']], formula_feature_str, sep = ' ~ ')
# formula <- as.formula(formula_str)
# residuals <- gam(formula, data = data, method = 'GCV.Cp')$residuals
# smooth_models$residuals[i] <- list(residuals)
# }
# saveRDS(smooth_models, 'smooth_models.rds')
smooth_models <- readRDS('smooth_models.rds')
disaster_predictors <- c('month', 'weekday', 'wd_avg', 'ws_avg',
'I(1/dist_wrp^2)', 'I(1/dist_ref^2)', 'I(1/dist_dc^2)',
'monthly_oil_2km', 'monthly_gas_2km', 'active_2km',
'inactive_2km', 'elevation', 'EVI',
'num_odor_complaints', 'closest_wrp_capacity')
everything_predictors <- c('year', 'weekday', 'wd_avg', 'ws_avg',
'I(1/dist_wrp^2)', 'I(1/dist_ref^2)', 'I(1/dist_dc^2)',
'monthly_oil_2km', 'monthly_gas_2km', 'active_2km',
'inactive_2km', 'elevation', 'EVI',
'num_odor_complaints', 'closest_wrp_capacity')
daily_predictors <- c('daily_downwind_ref', 'daily_downwind_wrp', 'daily_temp',
'daily_hum', 'daily_precip')
hourly_predictors <- c('hourly_downwind_ref', 'hourly_downwind_wrp', 'hourly_temp',
'hourly_hum', 'hourly_precip')
disaster_linear_pred_str <- paste(disaster_predictors, collapse = ' + ')
everything_linear_pred_str <- paste(everything_predictors, collapse = ' + ')
daily_linear_pred_str <- paste(daily_predictors, collapse = ' + ')
hourly_linear_pred_str <- paste(hourly_predictors, collapse = ' + ')
# for (i in 1:nrow(smooth_models)) {
# if (smooth_models[[i, 'daterange']] == 'dis_ind' &
# smooth_models[[i, 'response']] %in% hourly_responses) {
# formula_str <- paste(everything_linear_pred_str,
# hourly_linear_pred_str,
# 'disaster', sep = ' + ')
# print(paste0(smooth_models[[i, 'daterange']], ': everything hourly + ind'))
# } else if (smooth_models[[i, 'daterange']] == 'dis' &
# smooth_models[[i, 'response']] %in% hourly_responses) {
# formula_str <- paste(disaster_linear_pred_str,
# hourly_linear_pred_str, sep = ' + ')
# print(paste0(smooth_models[[i, 'daterange']], ': disaster hourly'))
# } else if (smooth_models[[i, 'response']] %in% hourly_responses) {
# formula_str <- paste(everything_linear_pred_str,
# hourly_linear_pred_str, sep = ' + ')
# print(paste0(smooth_models[[i, 'daterange']], ': everything hourly'))
# } else if (smooth_models[[i, 'daterange']] == 'dis_ind') {
# formula_str <- paste(everything_linear_pred_str,
# daily_linear_pred_str,
# 'disaster', sep = ' + ')
# print(paste0(smooth_models[[i, 'daterange']], ': everything daily + ind'))
# } else if (smooth_models[[i, 'daterange']] == 'dis') {
# formula_str <- paste(disaster_linear_pred_str,
# daily_linear_pred_str, sep = ' + ')
# print(paste0(smooth_models[[i, 'daterange']], ': disaster daily'))
# } else {
# formula_str <- paste(everything_linear_pred_str,
# daily_linear_pred_str, sep = ' + ')
# print(paste0(smooth_models[[i, 'daterange']], ': everything daily'))
# }
# formula_str <- paste('residuals', formula_str, sep = ' ~ ')
# formula <- as.formula(formula_str)
#
# if (smooth_models[[i, 'response']] %in% hourly_responses &
# smooth_models[[i, 'daterange']] == 'full') {
# data <- hourly_full
# } else if (smooth_models[[i, 'response']] %in% hourly_responses){
# data <- get(paste0('hourly_data_', smooth_models[[i, 'daterange']]))
# } else if (smooth_models[[i, 'daterange']] == 'full') {
# data <- daily_full
# } else {
# data <- get(paste0('daily_data_', smooth_models[[i, 'daterange']]))
# }
#
# data$residuals <- unlist(smooth_models$residuals[i])
# regsubsets <- regsubsets(formula, data, nvmax = Inf)
# assign(paste0(smooth_models$name[i], '_regsubsets'), regsubsets)
# print(str_glue('Completed {i} rows'))
# }
#
# for (i in 1:nrow(smooth_models)) {
# saveRDS(get(paste0(smooth_models$name[i], '_regsubsets')),
# paste0('regsubsets/', smooth_models$name[i], '_regsubsets.rds'))
# }
# read regsubsets
for (i in 1:nrow(smooth_models)) {
assign(paste0(smooth_models$name[i], '_regsubsets'),
readRDS(paste0('regsubsets/', smooth_models$name[i], '_regsubsets.rds')))
}
# best_model_sizes <- smooth_models %>%
# group_by(name, response, daterange) %>%
# summarise(mean_smooth_res = mean(unlist(residuals)),
# var_smooth_res = round(var(unlist(residuals))), 4) %>%
# select(any_of(c('name', 'response', 'daterange',
# 'mean_smooth_res', 'var_smooth_res'))) %>%
# mutate(Adj.R2 = NA,
# best_R2 = NA,
# CP = NA,
# best_CP = NA,
# BIC = NA,
# best_BIC = NA,
# linear_features = NA)
# for (i in 1:nrow(smooth_models)) {
# regsubset <- summary(get(paste0(smooth_models$name[i], '_regsubsets')))
# best_sizes <- tibble(Adj.R2 = which.max(regsubset$adjr2),
# best_R2 = regsubset$adjr2[which.max(regsubset$adjr2)],
# CP = which.min(regsubset$cp),
# best_CP = regsubset$cp[which.max(regsubset$cp)],
# BIC = which.min(regsubset$bic),
# best_BIC = regsubset$bic[which.max(regsubset$bic)],)
# best_features <- tibble(linear_features = list(setdiff(names(regsubset$which[best_sizes$CP, ]
# [unlist(regsubset$which[best_sizes$CP, ])]), '(Intercept)')))
# best_model_sizes[i,] <- bind_cols(best_model_sizes[i, 1:5], best_sizes, best_features)
# }
#
# best_models <- best_model_sizes %>%
# rowwise() %>%
# mutate(smooth_features = if_else(daterange == 'dis', list(smooth_predictors[2:3]),
# list(smooth_predictors))) %>%
# mutate(full_features = list(c(unlist(smooth_features), unlist(linear_features)))) %>%
# ungroup()
#
# saveRDS(best_models, 'best_gam_models.rds')
best_gam_models <- readRDS('best_gam_models.rds')
# write function that takes in response, predictors, data and returns gam model
# different from stepwise function, this has select = FALSE
get_feature_vector <- function(response, daterange) {
feature_vec <- best_gam_models %>%
filter(response == .env$response & daterange == .env$daterange) %>%
pull(full_features) %>%
unlist()
feature_vec <- str_replace_all(feature_vec, 'month\\d+', 'month')
feature_vec <- str_replace_all(feature_vec, 'year\\d+', 'year')
feature_vec <- str_replace_all(feature_vec, 'weekday\\D+', 'weekday')
feature_vec <- unique(feature_vec)
return(feature_vec)
}
get_data <- function(response, daterange) {
if (response %in% hourly_responses &
daterange == 'full') {
data <- hourly_full
} else if (response %in% hourly_responses){
data <- get(paste0('hourly_data_', daterange))
} else if (daterange == 'full') {
data <- daily_full
} else {
data <- get(paste0('daily_data_', daterange))
}
return(data)
}
get_gam_model <- function(response, daterange) {
predictors <- get_feature_vector(response, daterange)
formula_feature_str <- paste(predictors, collapse = ' + ')
formula_str <- paste(response, formula_feature_str, sep = ' ~ ')
formula <- as.formula(formula_str)
data <- get_data(response, daterange)
gam_model <- gam(formula, data = data, method = 'GCV.Cp')
return(gam_model)
}
# for (i in 1:nrow(best_gam_models)) {
# model <- get_gam_model(best_gam_models$response[i],
# best_gam_models$daterange[i])
# assign(paste0(best_gam_models$name[i], '_', best_gam_models$daterange[i], '_gam'), model)
# saveRDS(model, paste0('gam_models/', paste0(best_gam_models$name[i], '_gam.rds')))
# }
# # compare empty init vs full init and find best ones
# final_model_features_table <- model_features_table %>%
# group_by(stat, date) %>%
# filter(GCV == min(GCV))
#
# final_model_features_table <- final_model_features_table %>%
# select(-c(init, model_name)) %>%
# distinct()
for (i in 1:nrow(best_gam_models)) {
assign(paste0(best_gam_models$name[i], '_gam'),
readRDS(paste0('gam_models/', best_gam_models$name[i], '_gam.rds')))
}
Since February 2022
# Since feb 2022
summary(da_sincefeb2022_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg +
## I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_gas_2km +
## elevation + EVI + num_odor_complaints + daily_downwind_ref +
## daily_temp + daily_hum
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.360e+00 1.377e-01 17.134 < 2e-16 ***
## year2023 -1.502e-01 4.090e-02 -3.671 0.000243 ***
## weekdayMon 8.501e-02 1.464e-02 5.808 6.61e-09 ***
## weekdayTue 1.492e-01 1.461e-02 10.210 < 2e-16 ***
## weekdayWed 1.600e-01 1.465e-02 10.925 < 2e-16 ***
## weekdayThu 1.112e-01 1.466e-02 7.588 3.71e-14 ***
## weekdayFri 1.301e-01 1.462e-02 8.901 < 2e-16 ***
## weekdaySat 6.965e-02 1.461e-02 4.767 1.91e-06 ***
## wd_avg 2.874e-04 5.319e-05 5.403 6.80e-08 ***
## ws_avg -6.768e-02 2.767e-03 -24.464 < 2e-16 ***
## I(1/dist_wrp^2) 6.042e-07 3.582e-07 1.686 0.091755 .
## I(1/dist_ref^2) 1.458e-05 4.195e-06 3.475 0.000514 ***
## I(1/dist_dc^2) -2.852e-04 1.095e-04 -2.605 0.009221 **
## monthly_gas_2km 4.023e-05 9.376e-06 4.291 1.81e-05 ***
## elevation -3.687e-02 4.511e-03 -8.172 3.63e-16 ***
## EVI -1.343e+00 6.496e-02 -20.679 < 2e-16 ***
## num_odor_complaints 8.478e-03 1.877e-03 4.517 6.37e-06 ***
## daily_downwind_ref -5.009e-03 1.620e-02 -0.309 0.757170
## daily_temp 2.439e-03 1.418e-03 1.720 0.085482 .
## daily_hum -9.993e-03 3.923e-04 -25.473 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 6.901 8.000 5.193
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.943 8.997 38.764
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.859 80.000 38.890
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 114/117
## R-sq.(adj) = 0.618 Deviance explained = 62.4%
## GCV = 0.10034 Scale est. = 0.098606 n = 6531
Disaster Only
# Disaster only
summary(da_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp",
## k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + weekday +
## wd_avg + ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) +
## monthly_oil_2km + inactive_2km + num_odor_complaints + daily_hum
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.242e+01 1.240e+01 2.615 0.009040 **
## weekdayMon -6.532e+00 7.104e+00 -0.920 0.357985
## weekdayTue -1.077e+01 7.280e+00 -1.479 0.139444
## weekdayWed 4.488e-01 7.088e+00 0.063 0.949527
## weekdayThu -3.571e-01 7.076e+00 -0.050 0.959766
## weekdayFri 1.919e+00 6.917e+00 0.277 0.781522
## weekdaySat 1.194e+00 7.040e+00 0.170 0.865381
## wd_avg -6.137e-02 2.086e-02 -2.941 0.003332 **
## ws_avg 6.722e+00 1.791e+00 3.753 0.000183 ***
## I(1/dist_wrp^2) -1.034e-04 5.137e-05 -2.012 0.044470 *
## I(1/dist_ref^2) 1.197e-03 6.008e-04 1.993 0.046481 *
## I(1/dist_dc^2) 3.169e-01 1.510e-01 2.098 0.036104 *
## monthly_oil_2km -5.058e-03 1.771e-03 -2.857 0.004358 **
## inactive_2km 4.246e+00 1.757e+00 2.417 0.015818 *
## num_odor_complaints -1.072e+00 1.618e-01 -6.624 5.3e-11 ***
## daily_hum -2.316e-01 1.185e-01 -1.954 0.050880 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.313 8.838 3.189
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 67.903 80.000 8.117
## p-value
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9e-04 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 102/105
## R-sq.(adj) = 0.417 Deviance explained = 45.7%
## GCV = 4760.8 Scale est. = 4427.1 n = 1273
Exclude Disaster
# Exclude disaster
summary(da_excl_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + ws_avg + I(1/dist_wrp^2) +
## I(1/dist_dc^2) + monthly_oil_2km + active_2km + inactive_2km +
## elevation + EVI + num_odor_complaints + daily_downwind_ref +
## daily_temp + daily_hum + daily_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.336e+00 1.149e-01 11.632 < 2e-16 ***
## year2021 1.601e-02 4.026e-02 0.398 0.69090
## year2022 4.588e-01 6.379e-02 7.193 6.67e-13 ***
## year2023 4.155e-01 6.146e-02 6.759 1.44e-11 ***
## weekdayMon 9.196e-02 1.360e-02 6.764 1.40e-11 ***
## weekdayTue 1.457e-01 1.355e-02 10.757 < 2e-16 ***
## weekdayWed 1.716e-01 1.354e-02 12.671 < 2e-16 ***
## weekdayThu 1.524e-01 1.355e-02 11.253 < 2e-16 ***
## weekdayFri 1.479e-01 1.355e-02 10.916 < 2e-16 ***
## weekdaySat 7.646e-02 1.356e-02 5.639 1.74e-08 ***
## ws_avg -2.865e-02 1.914e-03 -14.971 < 2e-16 ***
## I(1/dist_wrp^2) 6.345e-07 3.319e-07 1.912 0.05590 .
## I(1/dist_dc^2) 4.828e-04 2.112e-04 2.286 0.02228 *
## monthly_oil_2km 4.078e-06 4.286e-06 0.952 0.34136
## active_2km 1.040e-02 2.562e-03 4.058 4.97e-05 ***
## inactive_2km -8.128e-03 5.917e-03 -1.374 0.16957
## elevation -7.787e-03 2.782e-03 -2.798 0.00514 **
## EVI -1.565e+00 9.635e-02 -16.245 < 2e-16 ***
## num_odor_complaints 4.772e-03 1.044e-03 4.569 4.94e-06 ***
## daily_downwind_ref -3.253e-02 1.420e-02 -2.290 0.02203 *
## daily_temp 2.445e-03 1.226e-03 1.994 0.04613 *
## daily_hum -9.020e-03 3.402e-04 -26.510 < 2e-16 ***
## daily_precip -6.312e-02 2.599e-02 -2.428 0.01518 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.909 8 107.91
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000 9 42.43
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.630 80 50.40
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 118/120
## R-sq.(adj) = 0.459 Deviance explained = 46.3%
## GCV = 0.18824 Scale est. = 0.1867 n = 14322
Everything w Disaster Indicator
# Disaster indicator
summary(da_dis_ind_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + wd_avg + I(1/dist_wrp^2) +
## I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km +
## elevation + EVI + num_odor_complaints + closest_wrp_capacity +
## daily_downwind_ref + daily_downwind_wrp + daily_hum + disaster
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.153e+02 1.682e+01 -6.854 7.48e-12 ***
## year2021 1.107e+01 2.264e+00 4.891 1.02e-06 ***
## year2022 1.349e+01 2.925e+00 4.612 4.02e-06 ***
## year2023 9.270e+00 2.703e+00 3.429 0.000606 ***
## wd_avg -1.067e-02 2.383e-03 -4.476 7.65e-06 ***
## I(1/dist_wrp^2) -2.361e-05 5.243e-06 -4.503 6.74e-06 ***
## I(1/dist_ref^2) -7.832e-04 2.288e-04 -3.423 0.000622 ***
## I(1/dist_dc^2) 1.038e-01 1.910e-02 5.436 5.54e-08 ***
## monthly_oil_2km 2.611e-04 2.487e-04 1.050 0.293757
## active_2km 7.209e-01 1.122e-01 6.427 1.34e-10 ***
## elevation -6.395e-01 1.300e-01 -4.921 8.71e-07 ***
## EVI -3.477e+01 5.784e+00 -6.013 1.87e-09 ***
## num_odor_complaints 6.429e-01 3.073e-02 20.920 < 2e-16 ***
## closest_wrp_capacity 2.489e-01 3.446e-02 7.222 5.38e-13 ***
## daily_downwind_ref -3.257e+00 7.296e-01 -4.464 8.09e-06 ***
## daily_downwind_wrp 1.003e+00 7.885e-01 1.272 0.203454
## daily_hum -6.404e-02 1.501e-02 -4.268 1.99e-05 ***
## disaster 1.334e+01 1.874e+00 7.116 1.16e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.223 8.000 3.920
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.918 8.978 7.176
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 73.982 80.000 8.124
## p-value
## s(as.numeric(month)) 3.17e-05 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 112/115
## R-sq.(adj) = 0.114 Deviance explained = 12%
## GCV = 558.69 Scale est. = 554.93 n = 15595
Everything w.o Disaster Indicator
# Everything
summary(da_full_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + wd_avg + I(1/dist_wrp^2) +
## I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km +
## elevation + EVI + num_odor_complaints + closest_wrp_capacity +
## daily_downwind_ref + daily_downwind_wrp + daily_hum
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.133e+01 1.643e+01 -5.560 2.74e-08 ***
## year2021 1.432e+00 1.818e+00 0.788 0.4309
## year2022 1.369e+00 2.382e+00 0.575 0.5655
## year2023 1.471e+00 2.473e+00 0.595 0.5521
## wd_avg -1.045e-02 2.387e-03 -4.377 1.21e-05 ***
## I(1/dist_wrp^2) -2.190e-05 5.392e-06 -4.062 4.89e-05 ***
## I(1/dist_ref^2) -5.814e-04 2.290e-04 -2.539 0.0111 *
## I(1/dist_dc^2) 8.553e-02 1.867e-02 4.582 4.63e-06 ***
## monthly_oil_2km 3.076e-04 2.482e-04 1.239 0.2154
## active_2km 6.364e-01 1.117e-01 5.699 1.23e-08 ***
## elevation -6.377e-01 1.301e-01 -4.901 9.63e-07 ***
## EVI -3.090e+01 5.760e+00 -5.364 8.26e-08 ***
## num_odor_complaints 6.680e-01 3.059e-02 21.836 < 2e-16 ***
## closest_wrp_capacity 2.364e-01 3.428e-02 6.896 5.56e-12 ***
## daily_downwind_ref -3.341e+00 7.307e-01 -4.572 4.86e-06 ***
## daily_downwind_wrp 9.727e-01 7.897e-01 1.232 0.2181
## daily_hum -6.601e-02 1.502e-02 -4.394 1.12e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.189 8.000 4.364
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.913 8.977 6.952
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 73.797 80.000 7.824
## p-value
## s(as.numeric(month)) 3.41e-06 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 111/114
## R-sq.(adj) = 0.111 Deviance explained = 11.7%
## GCV = 560.48 Scale est. = 556.74 n = 15595
Since February 2022
# Since feb 2022
summary(log_da_sincefeb2022_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_daily_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg +
## I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + inactive_2km +
## elevation + EVI + num_odor_complaints + daily_temp + daily_hum
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.432e+00 1.981e-01 7.232 5.30e-13 ***
## year2023 -8.379e-02 5.080e-02 -1.649 0.099161 .
## weekdayMon 1.229e-01 1.816e-02 6.767 1.43e-11 ***
## weekdayTue 2.075e-01 1.813e-02 11.445 < 2e-16 ***
## weekdayWed 2.046e-01 1.817e-02 11.258 < 2e-16 ***
## weekdayThu 1.491e-01 1.819e-02 8.196 2.98e-16 ***
## weekdayFri 1.690e-01 1.814e-02 9.316 < 2e-16 ***
## weekdaySat 1.111e-01 1.813e-02 6.129 9.36e-10 ***
## wd_avg 3.863e-04 6.599e-05 5.853 5.05e-09 ***
## ws_avg -9.982e-02 3.432e-03 -29.085 < 2e-16 ***
## I(1/dist_wrp^2) 2.520e-07 4.049e-07 0.622 0.533743
## I(1/dist_ref^2) -2.302e-07 4.747e-06 -0.048 0.961325
## I(1/dist_dc^2) -2.180e-04 6.788e-05 -3.212 0.001326 **
## inactive_2km 1.669e-02 4.984e-03 3.350 0.000814 ***
## elevation -3.943e-02 7.751e-03 -5.087 3.74e-07 ***
## EVI -1.905e+00 1.451e-01 -13.124 < 2e-16 ***
## num_odor_complaints 1.194e-02 2.328e-03 5.130 2.99e-07 ***
## daily_temp 2.513e-03 1.762e-03 1.426 0.153804
## daily_hum -1.297e-02 4.847e-04 -26.769 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.093 8.00 3.389
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 7.595 8.23 53.608
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.875 80.00 63.580
## p-value
## s(as.numeric(month)) 0.000191 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 113/116
## R-sq.(adj) = 0.698 Deviance explained = 70.3%
## GCV = 0.15447 Scale est. = 0.15186 n = 6531
get_gam_result <- function(model_obj, model_name){
p <- summary(model_obj)$p.pv
coef <- tibble(Term = names(summary(model_obj)$p.coeff), x = summary(model_obj)$p.coeff) %>%
mutate(x = paste0(formatC(x, format = 'e', digits = 2),
case_when(p < 0.001 ~ '***',
0.001 <= p & p < 0.01 ~ '**',
0.01 <= p & p < 0.05 ~ '*',
0.05 <= p & p < 0.1 ~ '.',
TRUE ~ ''))) %>%
rename({{model_name}} := x)
coef <- rbind(coef, tibble(Term = 'Adj.R-sq', {{model_name}} :=
formatC(summary(model_obj)$r.sq, format = 'e',
digits = 2)))
return(coef)
}
sincefeb2022_model_stat <- get_gam_result(log_da_sincefeb2022_gam, 'Daily Average H2S Since Feb 2022')
sincefeb2022_model_stat_latex <- knitr::kable(sincefeb2022_model_stat, format = 'latex', digits = 2)
writeLines(sincefeb2022_model_stat_latex, '../figures/sincefeb2022_log_model.tex')
knitr::kable(sincefeb2022_model_stat, format = 'html', digits = 2)
| Term | Daily Average H2S Since Feb 2022 |
|---|---|
| (Intercept) | 1.43e+00*** |
| year2023 | -8.38e-02. |
| weekdayMon | 1.23e-01*** |
| weekdayTue | 2.08e-01*** |
| weekdayWed | 2.05e-01*** |
| weekdayThu | 1.49e-01*** |
| weekdayFri | 1.69e-01*** |
| weekdaySat | 1.11e-01*** |
| wd_avg | 3.86e-04*** |
| ws_avg | -9.98e-02*** |
| I(1/dist_wrp^2) | 2.52e-07 |
| I(1/dist_ref^2) | -2.30e-07 |
| I(1/dist_dc^2) | -2.18e-04** |
| inactive_2km | 1.67e-02*** |
| elevation | -3.94e-02*** |
| EVI | -1.90e+00*** |
| num_odor_complaints | 1.19e-02*** |
| daily_temp | 2.51e-03 |
| daily_hum | -1.30e-02*** |
| Adj.R-sq | 6.98e-01 |
Disaster Only
# Disaster only
summary(log_da_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_daily_avg) ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp",
## "cc")) + month + weekday + ws_avg + I(1/dist_wrp^2) +
## I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km +
## inactive_2km + elevation + num_odor_complaints + closest_wrp_capacity +
## daily_downwind_ref + daily_temp + daily_hum
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.399e+01 5.713e+00 -2.449 0.014450 *
## month11 5.086e-01 1.310e-01 3.882 0.000109 ***
## month12 4.853e-01 1.452e-01 3.343 0.000854 ***
## weekdayMon 4.504e-02 6.325e-02 0.712 0.476553
## weekdayTue -1.303e-01 6.478e-02 -2.012 0.044445 *
## weekdayWed 1.073e-01 6.281e-02 1.708 0.087868 .
## weekdayThu 7.894e-02 6.303e-02 1.252 0.210689
## weekdayFri -9.842e-02 6.144e-02 -1.602 0.109460
## weekdaySat -1.387e-01 6.204e-02 -2.236 0.025544 *
## ws_avg -1.428e-01 1.628e-02 -8.766 < 2e-16 ***
## I(1/dist_wrp^2) -1.877e-05 7.598e-06 -2.470 0.013655 *
## I(1/dist_ref^2) -1.698e-04 7.029e-05 -2.415 0.015875 *
## I(1/dist_dc^2) 1.556e-01 6.090e-02 2.556 0.010722 *
## monthly_oil_2km 3.410e-04 1.225e-04 2.783 0.005466 **
## active_2km -4.484e-01 4.093e-02 -10.956 < 2e-16 ***
## inactive_2km 9.736e-01 1.340e-01 7.263 6.83e-13 ***
## elevation -2.777e-01 3.584e-02 -7.748 1.99e-14 ***
## num_odor_complaints 8.276e-03 1.340e-03 6.177 8.98e-10 ***
## closest_wrp_capacity 4.577e-02 1.095e-02 4.182 3.11e-05 ***
## daily_downwind_ref -8.101e-02 5.618e-02 -1.442 0.149606
## daily_temp 9.549e-03 6.866e-03 1.391 0.164564
## daily_hum -8.024e-03 1.338e-03 -5.997 2.67e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.00 9 25.693
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 58.55 80 9.163
## p-value
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 108/111
## R-sq.(adj) = 0.746 Deviance explained = 76.3%
## GCV = 0.36938 Scale est. = 0.34427 n = 1273
Exclude Disaster
# Exclude disaster
summary(log_da_excl_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_daily_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + ws_avg + I(1/dist_wrp^2) +
## I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km +
## inactive_2km + elevation + EVI + num_odor_complaints + daily_downwind_ref +
## daily_temp + daily_hum + daily_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.777e-04 1.276e-01 -0.003 0.997638
## year2021 5.695e-02 4.471e-02 1.274 0.202721
## year2022 2.862e-01 7.084e-02 4.040 5.36e-05 ***
## year2023 3.888e-01 6.828e-02 5.694 1.27e-08 ***
## weekdayMon 1.206e-01 1.509e-02 7.990 1.46e-15 ***
## weekdayTue 1.832e-01 1.504e-02 12.182 < 2e-16 ***
## weekdayWed 2.088e-01 1.503e-02 13.889 < 2e-16 ***
## weekdayThu 1.913e-01 1.504e-02 12.717 < 2e-16 ***
## weekdayFri 1.901e-01 1.505e-02 12.636 < 2e-16 ***
## weekdaySat 1.047e-01 1.505e-02 6.953 3.73e-12 ***
## ws_avg -3.991e-02 2.125e-03 -18.782 < 2e-16 ***
## I(1/dist_wrp^2) 1.817e-06 3.693e-07 4.918 8.83e-07 ***
## I(1/dist_ref^2) -7.611e-05 1.670e-05 -4.558 5.20e-06 ***
## I(1/dist_dc^2) -3.279e-04 3.098e-04 -1.058 0.289931
## monthly_oil_2km -2.430e-06 4.769e-06 -0.510 0.610338
## active_2km 2.162e-02 2.857e-03 7.565 4.11e-14 ***
## inactive_2km -9.643e-03 6.648e-03 -1.451 0.146930
## elevation -2.695e-03 3.091e-03 -0.872 0.383265
## EVI -2.515e+00 1.072e-01 -23.470 < 2e-16 ***
## num_odor_complaints 4.427e-03 1.160e-03 3.818 0.000135 ***
## daily_downwind_ref 5.065e-03 1.577e-02 0.321 0.748096
## daily_temp 5.239e-03 1.361e-03 3.850 0.000119 ***
## daily_hum -1.109e-02 3.777e-04 -29.360 < 2e-16 ***
## daily_precip -1.224e-01 2.886e-02 -4.242 2.23e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.904 8 102.29
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000 9 104.05
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.878 80 86.72
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 118/121
## R-sq.(adj) = 0.567 Deviance explained = 57%
## GCV = 0.23204 Scale est. = 0.23013 n = 14322
Everything w Disaster Indicator
# Disaster indicator
summary(log_da_dis_ind_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_daily_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + ws_avg + I(1/dist_dc^2) +
## monthly_oil_2km + active_2km + inactive_2km + elevation +
## EVI + num_odor_complaints + daily_downwind_ref + daily_temp +
## daily_hum + daily_precip + disaster
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.736e-01 1.489e-01 -2.509 0.012120 *
## year2021 9.739e-01 5.353e-02 18.193 < 2e-16 ***
## year2022 1.040e+00 6.966e-02 14.930 < 2e-16 ***
## year2023 9.021e-01 6.496e-02 13.888 < 2e-16 ***
## weekdayMon 1.070e-01 1.628e-02 6.575 5.01e-11 ***
## weekdayTue 1.477e-01 1.623e-02 9.100 < 2e-16 ***
## weekdayWed 1.957e-01 1.621e-02 12.073 < 2e-16 ***
## weekdayThu 1.767e-01 1.623e-02 10.886 < 2e-16 ***
## weekdayFri 1.573e-01 1.620e-02 9.711 < 2e-16 ***
## weekdaySat 7.822e-02 1.623e-02 4.819 1.46e-06 ***
## ws_avg -4.139e-02 2.349e-03 -17.619 < 2e-16 ***
## I(1/dist_dc^2) 1.592e-03 2.785e-04 5.716 1.11e-08 ***
## monthly_oil_2km -1.465e-05 4.900e-06 -2.990 0.002798 **
## active_2km 2.956e-02 3.023e-03 9.778 < 2e-16 ***
## inactive_2km -2.037e-02 6.992e-03 -2.913 0.003587 **
## elevation -1.096e-02 3.387e-03 -3.237 0.001213 **
## EVI -2.610e+00 1.135e-01 -22.999 < 2e-16 ***
## num_odor_complaints 1.765e-02 7.066e-04 24.978 < 2e-16 ***
## daily_downwind_ref -5.012e-02 1.667e-02 -3.007 0.002644 **
## daily_temp -5.042e-03 1.411e-03 -3.574 0.000353 ***
## daily_hum -1.197e-02 3.847e-04 -31.115 < 2e-16 ***
## daily_precip -1.284e-01 2.729e-02 -4.704 2.57e-06 ***
## disaster 1.029e+00 4.399e-02 23.382 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.865 8.00 48.91
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.854 8.96 59.43
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.427 80.00 74.35
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 119/120
## R-sq.(adj) = 0.559 Deviance explained = 56.2%
## GCV = 0.2936 Scale est. = 0.29138 n = 15595
Everything w.o Disaster Indicator
# Everything
summary(log_da_full_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_daily_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + ws_avg + I(1/dist_dc^2) +
## monthly_oil_2km + active_2km + inactive_2km + elevation +
## EVI + num_odor_complaints + daily_downwind_ref + daily_temp +
## daily_hum + daily_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.093e+00 1.372e-01 7.965 1.77e-15 ***
## year2021 2.027e-01 4.294e-02 4.720 2.38e-06 ***
## year2022 6.589e-02 5.685e-02 1.159 0.24653
## year2023 2.483e-01 5.967e-02 4.161 3.19e-05 ***
## weekdayMon 1.069e-01 1.656e-02 6.455 1.12e-10 ***
## weekdayTue 1.486e-01 1.651e-02 8.997 < 2e-16 ***
## weekdayWed 1.969e-01 1.650e-02 11.933 < 2e-16 ***
## weekdayThu 1.764e-01 1.651e-02 10.684 < 2e-16 ***
## weekdayFri 1.591e-01 1.649e-02 9.650 < 2e-16 ***
## weekdaySat 8.031e-02 1.652e-02 4.862 1.17e-06 ***
## ws_avg -4.308e-02 2.389e-03 -18.032 < 2e-16 ***
## I(1/dist_dc^2) 1.503e-03 2.365e-04 6.355 2.15e-10 ***
## monthly_oil_2km -1.173e-05 4.979e-06 -2.355 0.01855 *
## active_2km 1.887e-02 3.033e-03 6.220 5.08e-10 ***
## inactive_2km -7.388e-04 7.016e-03 -0.105 0.91613
## elevation -1.597e-02 3.432e-03 -4.653 3.31e-06 ***
## EVI -2.432e+00 1.149e-01 -21.159 < 2e-16 ***
## num_odor_complaints 1.953e-02 7.144e-04 27.338 < 2e-16 ***
## daily_downwind_ref -5.271e-02 1.696e-02 -3.108 0.00189 **
## daily_temp -3.894e-03 1.434e-03 -2.715 0.00663 **
## daily_hum -1.193e-02 3.913e-04 -30.489 < 2e-16 ***
## daily_precip -1.368e-01 2.777e-02 -4.925 8.53e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.765 8.00 39.74
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.653 8.89 48.04
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.455 80.00 72.57
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 118/119
## R-sq.(adj) = 0.543 Deviance explained = 54.7%
## GCV = 0.30403 Scale est. = 0.30175 n = 15595
Since February 2022
# Since feb 2022
summary(dm_sincefeb2022_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg +
## I(1/dist_wrp^2) + I(1/dist_ref^2) + monthly_oil_2km + active_2km +
## num_odor_complaints + daily_downwind_wrp + daily_temp + daily_hum +
## daily_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.926e+00 9.735e-01 3.005 0.002662 **
## year2023 -1.214e+00 2.950e-01 -4.115 3.92e-05 ***
## weekdayMon 5.560e-01 1.546e-01 3.597 0.000324 ***
## weekdayTue 9.163e-01 1.544e-01 5.936 3.07e-09 ***
## weekdayWed 8.815e-01 1.545e-01 5.706 1.21e-08 ***
## weekdayThu 7.475e-01 1.546e-01 4.834 1.37e-06 ***
## weekdayFri 8.450e-01 1.542e-01 5.479 4.44e-08 ***
## weekdaySat 3.239e-01 1.543e-01 2.099 0.035896 *
## wd_avg 2.251e-03 5.606e-04 4.016 6.00e-05 ***
## ws_avg -2.553e-01 3.107e-02 -8.217 2.50e-16 ***
## I(1/dist_wrp^2) -1.883e-07 1.719e-05 -0.011 0.991260
## I(1/dist_ref^2) 3.867e-03 7.315e-04 5.286 1.29e-07 ***
## monthly_oil_2km 1.883e-04 4.105e-05 4.587 4.57e-06 ***
## active_2km -7.997e-02 1.417e-02 -5.642 1.75e-08 ***
## num_odor_complaints 1.415e-01 1.975e-02 7.165 8.62e-13 ***
## daily_downwind_wrp 4.322e-01 1.856e-01 2.328 0.019931 *
## daily_temp 4.145e-02 1.360e-02 3.047 0.002318 **
## daily_hum -3.255e-02 3.887e-03 -8.373 < 2e-16 ***
## daily_precip 6.498e-01 2.383e-01 2.727 0.006416 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 0.04726 8.000 0.006
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.87275 8.977 8.416
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 77.91134 80.000 3.386
## p-value
## s(as.numeric(month)) 0.363
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 114/116
## R-sq.(adj) = 0.219 Deviance explained = 23.1%
## GCV = 11.187 Scale est. = 11.009 n = 6531
Disaster Only
# Disaster only
summary(dm_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp",
## k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + wd_avg +
## ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) +
## monthly_oil_2km + inactive_2km + num_odor_complaints + daily_hum
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.849e+02 9.013e+01 3.161 0.00161 **
## wd_avg -5.063e-01 1.611e-01 -3.143 0.00172 **
## ws_avg 4.253e+01 1.327e+01 3.206 0.00138 **
## I(1/dist_wrp^2) -9.204e-04 4.076e-04 -2.258 0.02413 *
## I(1/dist_ref^2) 1.050e-02 4.707e-03 2.231 0.02589 *
## I(1/dist_dc^2) 2.830e+00 1.205e+00 2.350 0.01895 *
## monthly_oil_2km -3.934e-02 1.377e-02 -2.858 0.00434 **
## inactive_2km 3.257e+01 1.364e+01 2.388 0.01708 *
## num_odor_complaints -8.312e+00 1.257e+00 -6.614 5.65e-11 ***
## daily_hum -1.896e+00 8.990e-01 -2.109 0.03519 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.359 8.855 3.374
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 68.202 80.000 8.513
## p-value
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 0.000459 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 96/99
## R-sq.(adj) = 0.432 Deviance explained = 46.9%
## GCV = 2.8585e+05 Scale est. = 2.6709e+05 n = 1273
Exclude Disaster
# Exclude disaster
summary(dm_excl_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg +
## I(1/dist_wrp^2) + I(1/dist_ref^2) + EVI + num_odor_complaints +
## daily_temp + daily_hum
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.003e+00 9.042e-01 3.321 0.000899 ***
## year2021 -2.472e-01 3.861e-01 -0.640 0.522146
## year2022 3.457e-01 5.820e-01 0.594 0.552551
## year2023 1.041e-01 5.522e-01 0.188 0.850526
## weekdayMon 5.609e-01 1.318e-01 4.255 2.10e-05 ***
## weekdayTue 9.562e-01 1.312e-01 7.286 3.37e-13 ***
## weekdayWed 8.571e-01 1.312e-01 6.533 6.68e-11 ***
## weekdayThu 7.787e-01 1.313e-01 5.932 3.05e-09 ***
## weekdayFri 7.438e-01 1.313e-01 5.665 1.50e-08 ***
## weekdaySat 3.121e-01 1.314e-01 2.375 0.017559 *
## wd_avg 1.151e-03 4.524e-04 2.545 0.010934 *
## ws_avg -1.008e-01 1.799e-02 -5.602 2.15e-08 ***
## I(1/dist_wrp^2) 3.888e-06 6.621e-07 5.873 4.38e-09 ***
## I(1/dist_ref^2) 2.305e-05 2.477e-06 9.306 < 2e-16 ***
## EVI -3.008e+00 3.765e-01 -7.988 1.48e-15 ***
## num_odor_complaints 7.779e-02 1.000e-02 7.779 7.82e-15 ***
## daily_temp 2.687e-02 1.176e-02 2.285 0.022318 *
## daily_hum -3.101e-02 3.231e-03 -9.599 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 6.744 8 5.513
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000 9 13.385
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 61.906 80 4.086
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 113/115
## R-sq.(adj) = 0.123 Deviance explained = 12.9%
## GCV = 17.651 Scale est. = 17.536 n = 14322
Everything w Disaster Indicator
# Disaster indicator
summary(dm_dis_ind_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + wd_avg + I(1/dist_wrp^2) +
## I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km +
## elevation + EVI + num_odor_complaints + closest_wrp_capacity +
## daily_downwind_ref + daily_downwind_wrp + daily_hum + disaster
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.256e+02 1.319e+02 -7.018 2.34e-12 ***
## year2021 8.862e+01 1.774e+01 4.995 5.96e-07 ***
## year2022 1.069e+02 2.292e+01 4.667 3.09e-06 ***
## year2023 7.281e+01 2.119e+01 3.437 0.000590 ***
## wd_avg -8.471e-02 1.865e-02 -4.542 5.62e-06 ***
## I(1/dist_wrp^2) -1.955e-04 4.146e-05 -4.715 2.44e-06 ***
## I(1/dist_ref^2) -5.882e-03 1.793e-03 -3.280 0.001041 **
## I(1/dist_dc^2) 8.348e-01 1.540e-01 5.421 6.00e-08 ***
## monthly_oil_2km 2.020e-03 1.952e-03 1.035 0.300864
## active_2km 5.716e+00 8.784e-01 6.508 7.88e-11 ***
## elevation -5.116e+00 1.018e+00 -5.027 5.04e-07 ***
## EVI -2.694e+02 4.528e+01 -5.949 2.75e-09 ***
## num_odor_complaints 5.403e+00 2.405e-01 22.461 < 2e-16 ***
## closest_wrp_capacity 1.979e+00 2.702e-01 7.323 2.55e-13 ***
## daily_downwind_ref -2.587e+01 5.710e+00 -4.531 5.91e-06 ***
## daily_downwind_wrp 8.655e+00 6.171e+00 1.403 0.160756
## daily_hum -4.561e-01 1.175e-01 -3.883 0.000104 ***
## disaster 1.081e+02 1.469e+01 7.359 1.95e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.304 8.000 4.641
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.912 8.976 7.470
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 74.237 80.000 8.796
## p-value
## s(as.numeric(month)) 1.27e-06 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 112/115
## R-sq.(adj) = 0.124 Deviance explained = 12.9%
## GCV = 34218 Scale est. = 33986 n = 15595
Everything w.o Disaster Indicator
# Everything
summary(dm_full_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + wd_avg + ws_avg + I(1/dist_wrp^2) +
## I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km +
## elevation + EVI + num_odor_complaints + closest_wrp_capacity +
## daily_downwind_ref + daily_downwind_wrp + daily_hum
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.412e+02 1.290e+02 -5.746 9.32e-09 ***
## year2021 1.055e+01 1.424e+01 0.741 0.4586
## year2022 9.823e+00 1.867e+01 0.526 0.5989
## year2023 8.893e+00 1.938e+01 0.459 0.6464
## wd_avg -8.519e-02 1.873e-02 -4.547 5.48e-06 ***
## ws_avg 1.243e+00 7.826e-01 1.588 0.1123
## I(1/dist_wrp^2) -1.791e-04 4.252e-05 -4.213 2.54e-05 ***
## I(1/dist_ref^2) -4.456e-03 1.788e-03 -2.492 0.0127 *
## I(1/dist_dc^2) 6.850e-01 1.514e-01 4.526 6.07e-06 ***
## monthly_oil_2km 2.394e-03 1.949e-03 1.229 0.2193
## active_2km 5.035e+00 8.745e-01 5.757 8.72e-09 ***
## elevation -5.010e+00 1.021e+00 -4.908 9.31e-07 ***
## EVI -2.347e+02 4.516e+01 -5.198 2.04e-07 ***
## num_odor_complaints 5.605e+00 2.395e-01 23.407 < 2e-16 ***
## closest_wrp_capacity 1.881e+00 2.690e-01 6.993 2.81e-12 ***
## daily_downwind_ref -2.669e+01 5.720e+00 -4.666 3.10e-06 ***
## daily_downwind_wrp 8.224e+00 6.182e+00 1.330 0.1834
## daily_hum -4.651e-01 1.177e-01 -3.951 7.82e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.306 8.000 5.365
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.910 8.976 7.353
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 74.029 80.000 8.445
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 112/115
## R-sq.(adj) = 0.121 Deviance explained = 12.6%
## GCV = 34333 Scale est. = 34102 n = 15595
Since February 2022
# Since feb 2022
summary(log_dm_sincefeb2022_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_daily_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg +
## I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_gas_2km + elevation +
## EVI + num_odor_complaints + daily_temp + daily_hum
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.492e+00 2.501e-01 9.961 < 2e-16 ***
## year2023 -2.048e-01 5.484e-02 -3.735 0.000189 ***
## weekdayMon 2.610e-01 2.900e-02 8.998 < 2e-16 ***
## weekdayTue 3.504e-01 2.896e-02 12.100 < 2e-16 ***
## weekdayWed 3.293e-01 2.899e-02 11.357 < 2e-16 ***
## weekdayThu 2.486e-01 2.901e-02 8.569 < 2e-16 ***
## weekdayFri 2.603e-01 2.893e-02 8.996 < 2e-16 ***
## weekdaySat 1.713e-01 2.896e-02 5.915 3.48e-09 ***
## wd_avg 3.861e-04 1.047e-04 3.688 0.000228 ***
## ws_avg -1.254e-01 5.460e-03 -22.974 < 2e-16 ***
## I(1/dist_ref^2) 4.423e-05 8.187e-06 5.402 6.82e-08 ***
## I(1/dist_dc^2) -1.144e-04 1.826e-04 -0.626 0.531126
## monthly_gas_2km 2.578e-05 1.841e-05 1.400 0.161627
## elevation -5.437e-02 8.928e-03 -6.090 1.20e-09 ***
## EVI -1.393e+00 1.273e-01 -10.943 < 2e-16 ***
## num_odor_complaints 2.723e-02 3.708e-03 7.344 2.33e-13 ***
## daily_temp 8.807e-03 2.523e-03 3.490 0.000486 ***
## daily_hum -1.382e-02 7.107e-04 -19.451 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 3.450e-06 8 0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000e+00 9 25.34
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.904e+01 80 18.59
## p-value
## s(as.numeric(month)) 0.707
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 113/115
## R-sq.(adj) = 0.543 Deviance explained = 55.1%
## GCV = 0.39402 Scale est. = 0.38774 n = 6531
Disaster Only
# Disaster only
summary(log_dm_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_daily_max) ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp",
## "cc")) + month + weekday + wd_avg + ws_avg + I(1/dist_wrp^2) +
## I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + monthly_gas_2km +
## inactive_2km + elevation + num_odor_complaints + daily_temp +
## daily_hum
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.788e+00 8.089e-01 8.391 < 2e-16 ***
## month11 -1.956e-01 1.658e-01 -1.180 0.238194
## month12 -4.188e-01 1.844e-01 -2.271 0.023313 *
## weekdayMon 1.455e-01 8.201e-02 1.775 0.076195 .
## weekdayTue -8.291e-02 8.401e-02 -0.987 0.323859
## weekdayWed 4.878e-02 8.157e-02 0.598 0.549949
## weekdayThu 1.501e-01 8.180e-02 1.835 0.066792 .
## weekdayFri -2.089e-01 7.977e-02 -2.619 0.008934 **
## weekdaySat -1.335e-01 8.061e-02 -1.657 0.097819 .
## wd_avg 2.471e-04 2.384e-04 1.037 0.300113
## ws_avg -1.441e-01 2.116e-02 -6.809 1.56e-11 ***
## I(1/dist_wrp^2) -3.130e-06 7.624e-07 -4.105 4.31e-05 ***
## I(1/dist_ref^2) 5.212e-05 7.485e-06 6.964 5.48e-12 ***
## I(1/dist_dc^2) 1.662e-02 2.265e-03 7.340 3.96e-13 ***
## monthly_oil_2km 4.527e-04 6.189e-05 7.314 4.74e-13 ***
## monthly_gas_2km -2.848e-03 3.507e-04 -8.119 1.17e-15 ***
## inactive_2km 8.977e-02 2.344e-02 3.830 0.000135 ***
## elevation -2.525e-01 2.570e-02 -9.826 < 2e-16 ***
## num_odor_complaints 8.459e-03 1.720e-03 4.919 9.90e-07 ***
## daily_temp 7.311e-03 8.888e-03 0.823 0.410953
## daily_hum -6.435e-03 1.733e-03 -3.712 0.000215 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.938 8.991 15.488
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 56.931 80.000 7.708
## p-value
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 107/110
## R-sq.(adj) = 0.725 Deviance explained = 74.3%
## GCV = 0.621 Scale est. = 0.58009 n = 1273
Exclude Disaster
# Exclude disaster
summary(log_dm_excl_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_daily_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg +
## I(1/dist_ref^2) + monthly_gas_2km + active_2km + inactive_2km +
## elevation + EVI + num_odor_complaints + daily_downwind_wrp +
## daily_temp + daily_hum + daily_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.085e+00 1.750e-01 6.201 5.76e-10 ***
## year2021 -3.869e-02 6.140e-02 -0.630 0.52867
## year2022 -1.362e-01 9.715e-02 -1.402 0.16104
## year2023 -1.666e-02 9.364e-02 -0.178 0.85882
## weekdayMon 2.610e-01 2.076e-02 12.570 < 2e-16 ***
## weekdayTue 3.067e-01 2.068e-02 14.835 < 2e-16 ***
## weekdayWed 3.350e-01 2.067e-02 16.210 < 2e-16 ***
## weekdayThu 2.968e-01 2.068e-02 14.351 < 2e-16 ***
## weekdayFri 2.901e-01 2.068e-02 14.024 < 2e-16 ***
## weekdaySat 1.532e-01 2.069e-02 7.401 1.43e-13 ***
## wd_avg 1.833e-04 7.184e-05 2.551 0.01075 *
## ws_avg -5.395e-02 2.933e-03 -18.392 < 2e-16 ***
## I(1/dist_ref^2) 1.829e-05 1.234e-05 1.482 0.13843
## monthly_gas_2km 4.008e-06 1.524e-05 0.263 0.79261
## active_2km 2.131e-02 3.587e-03 5.943 2.87e-09 ***
## inactive_2km -2.208e-02 8.378e-03 -2.635 0.00842 **
## elevation -2.023e-02 4.118e-03 -4.912 9.13e-07 ***
## EVI -2.105e+00 1.230e-01 -17.113 < 2e-16 ***
## num_odor_complaints 1.221e-02 1.592e-03 7.667 1.87e-14 ***
## daily_downwind_wrp 5.050e-02 2.300e-02 2.196 0.02809 *
## daily_temp 1.384e-02 1.876e-03 7.377 1.71e-13 ***
## daily_hum -1.262e-02 5.214e-04 -24.200 < 2e-16 ***
## daily_precip -1.039e-01 3.974e-02 -2.616 0.00891 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.695 8 44.12
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000 9 34.33
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.037 80 38.84
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 119/120
## R-sq.(adj) = 0.48 Deviance explained = 48.4%
## GCV = 0.43856 Scale est. = 0.43496 n = 14322
Everything w Disaster Indicator
# Disaster indicator
summary(log_dm_dis_ind_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_daily_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + ws_avg + I(1/dist_wrp^2) +
## I(1/dist_ref^2) + I(1/dist_dc^2) + elevation + EVI + num_odor_complaints +
## closest_wrp_capacity + daily_downwind_ref + daily_temp +
## daily_hum + daily_precip + disaster
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.046e+00 2.634e-01 7.768 8.45e-15 ***
## year2021 1.117e+00 6.996e-02 15.972 < 2e-16 ***
## year2022 1.134e+00 9.109e-02 12.447 < 2e-16 ***
## year2023 8.490e-01 8.407e-02 10.098 < 2e-16 ***
## weekdayMon 2.415e-01 2.160e-02 11.180 < 2e-16 ***
## weekdayTue 2.645e-01 2.153e-02 12.285 < 2e-16 ***
## weekdayWed 3.078e-01 2.152e-02 14.305 < 2e-16 ***
## weekdayThu 2.760e-01 2.154e-02 12.817 < 2e-16 ***
## weekdayFri 2.352e-01 2.150e-02 10.939 < 2e-16 ***
## weekdaySat 1.184e-01 2.154e-02 5.495 3.98e-08 ***
## ws_avg -5.286e-02 3.113e-03 -16.983 < 2e-16 ***
## I(1/dist_wrp^2) -4.773e-01 2.916e-02 -16.369 < 2e-16 ***
## I(1/dist_ref^2) 3.885e-04 2.803e-05 13.858 < 2e-16 ***
## I(1/dist_dc^2) 4.243e+03 2.592e+02 16.369 < 2e-16 ***
## elevation -1.904e-02 3.851e-03 -4.943 7.76e-07 ***
## EVI -1.362e+00 7.380e-02 -18.452 < 2e-16 ***
## num_odor_complaints 2.397e-02 9.373e-04 25.574 < 2e-16 ***
## closest_wrp_capacity -3.557e-03 3.529e-04 -10.078 < 2e-16 ***
## daily_downwind_ref 1.444e-02 2.221e-02 0.650 0.5157
## daily_temp -1.663e-03 1.870e-03 -0.889 0.3740
## daily_hum -1.380e-02 5.102e-04 -27.052 < 2e-16 ***
## daily_precip -9.858e-02 3.622e-02 -2.722 0.0065 **
## disaster 1.340e+00 5.767e-02 23.232 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.837 8.000 47.24
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.732 8.937 39.83
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 78.984 80.000 32.38
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 118/120
## R-sq.(adj) = 0.508 Deviance explained = 51.2%
## GCV = 0.51707 Scale est. = 0.5132 n = 15595
Everything w.o Disaster Indicator
# Everything
summary(log_dm_full_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_daily_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg +
## I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + elevation +
## EVI + num_odor_complaints + closest_wrp_capacity + daily_downwind_ref +
## daily_temp + daily_hum + daily_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.800e+00 2.568e-01 14.797 < 2e-16 ***
## year2021 1.321e-01 5.664e-02 2.332 0.01971 *
## year2022 -1.082e-01 7.504e-02 -1.442 0.14929
## year2023 4.617e-02 7.793e-02 0.592 0.55354
## weekdayMon 2.420e-01 2.199e-02 11.008 < 2e-16 ***
## weekdayTue 2.658e-01 2.191e-02 12.133 < 2e-16 ***
## weekdayWed 3.094e-01 2.189e-02 14.135 < 2e-16 ***
## weekdayThu 2.757e-01 2.191e-02 12.584 < 2e-16 ***
## weekdayFri 2.378e-01 2.187e-02 10.871 < 2e-16 ***
## weekdaySat 1.210e-01 2.192e-02 5.522 3.40e-08 ***
## wd_avg 9.051e-05 7.467e-05 1.212 0.22553
## ws_avg -5.582e-02 3.177e-03 -17.569 < 2e-16 ***
## I(1/dist_wrp^2) -4.951e-01 2.969e-02 -16.679 < 2e-16 ***
## I(1/dist_ref^2) -5.392e-02 3.235e-03 -16.668 < 2e-16 ***
## I(1/dist_dc^2) 4.393e+03 2.634e+02 16.679 < 2e-16 ***
## elevation -2.286e-02 3.899e-03 -5.863 4.63e-09 ***
## EVI -1.384e+00 7.443e-02 -18.595 < 2e-16 ***
## num_odor_complaints 2.642e-02 9.473e-04 27.893 < 2e-16 ***
## closest_wrp_capacity -3.463e-03 3.563e-04 -9.720 < 2e-16 ***
## daily_downwind_ref 9.056e-03 2.261e-02 0.400 0.68880
## daily_temp -2.244e-04 1.908e-03 -0.118 0.90640
## daily_hum -1.376e-02 5.219e-04 -26.365 < 2e-16 ***
## daily_precip -1.042e-01 3.700e-02 -2.815 0.00488 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.79 8.000 34.03
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.52 8.848 32.60
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 78.68 80.000 31.48
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 118/120
## R-sq.(adj) = 0.491 Deviance explained = 49.4%
## GCV = 0.53522 Scale est. = 0.53124 n = 15595
Since February 2022
# Since feb 2022
summary(ha_sincefeb2022_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_hourly_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg +
## I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km +
## monthly_gas_2km + active_2km + inactive_2km + elevation +
## EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref +
## hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.725e+00 1.545e-01 37.063 < 2e-16 ***
## year2023 -1.359e-01 1.555e-02 -8.744 < 2e-16 ***
## weekdayMon 7.473e-02 5.572e-03 13.412 < 2e-16 ***
## weekdayTue 1.400e-01 5.523e-03 25.345 < 2e-16 ***
## weekdayWed 1.484e-01 5.536e-03 26.810 < 2e-16 ***
## weekdayThu 1.079e-01 5.540e-03 19.477 < 2e-16 ***
## weekdayFri 1.246e-01 5.519e-03 22.581 < 2e-16 ***
## weekdaySat 6.291e-02 5.504e-03 11.431 < 2e-16 ***
## wd_avg -2.020e-04 1.771e-05 -11.406 < 2e-16 ***
## ws_avg -4.966e-02 6.923e-04 -71.727 < 2e-16 ***
## I(1/dist_wrp^2) 1.073e-06 6.572e-08 16.328 < 2e-16 ***
## I(1/dist_ref^2) 9.215e-06 7.250e-07 12.709 < 2e-16 ***
## I(1/dist_dc^2) -4.244e-04 1.158e-05 -36.634 < 2e-16 ***
## monthly_oil_2km 7.496e-06 2.163e-06 3.465 0.00053 ***
## monthly_gas_2km -3.754e-05 6.336e-06 -5.925 3.14e-09 ***
## active_2km 1.434e-02 1.733e-03 8.273 < 2e-16 ***
## inactive_2km -3.426e-02 4.717e-03 -7.264 3.78e-13 ***
## elevation -3.074e-02 3.480e-03 -8.835 < 2e-16 ***
## EVI -1.491e+00 7.683e-02 -19.410 < 2e-16 ***
## num_odor_complaints 1.309e-02 7.212e-04 18.150 < 2e-16 ***
## closest_wrp_capacity -2.882e-03 4.003e-04 -7.201 6.03e-13 ***
## hourly_downwind_ref -2.759e-02 5.412e-03 -5.097 3.46e-07 ***
## hourly_downwind_wrp 1.955e-02 6.256e-03 3.125 0.00178 **
## hourly_temp -3.226e-02 3.315e-04 -97.325 < 2e-16 ***
## hourly_hum -9.507e-03 1.176e-04 -80.843 < 2e-16 ***
## hourly_precip 4.669e-01 1.008e-01 4.632 3.62e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.863 8.00 108.1
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.260 8.26 184.5
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.990 80.00 289.6
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 119/123
## R-sq.(adj) = 0.351 Deviance explained = 35.2%
## GCV = 0.33385 Scale est. = 0.33359 n = 153718
Disaster Only
# Disaster only
summary(ha_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_hourly_avg ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp",
## k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + month +
## weekday + wd_avg + ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) +
## I(1/dist_dc^2) + monthly_oil_2km + monthly_gas_2km + active_2km +
## inactive_2km + elevation + EVI + num_odor_complaints + closest_wrp_capacity +
## hourly_downwind_wrp + hourly_temp + hourly_hum
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.439e+02 2.504e+02 1.373 0.169667
## month11 -5.068e+00 5.033e+00 -1.007 0.313944
## month12 -1.720e+00 5.452e+00 -0.316 0.752358
## weekdayMon -3.528e+00 2.586e+00 -1.364 0.172618
## weekdayTue -6.387e+00 2.595e+00 -2.461 0.013857 *
## weekdayWed -1.574e+00 2.562e+00 -0.615 0.538825
## weekdayThu 1.794e+00 2.558e+00 0.701 0.483077
## weekdayFri 2.918e+00 2.502e+00 1.166 0.243428
## weekdaySat 2.057e+00 2.531e+00 0.813 0.416351
## wd_avg -2.871e-02 6.943e-03 -4.135 3.56e-05 ***
## ws_avg 1.454e+00 3.702e-01 3.929 8.55e-05 ***
## I(1/dist_wrp^2) -6.655e-04 1.696e-04 -3.925 8.71e-05 ***
## I(1/dist_ref^2) -1.667e-03 2.781e-03 -0.599 0.548960
## I(1/dist_dc^2) 5.893e+00 1.665e+00 3.539 0.000402 ***
## monthly_oil_2km -1.799e-02 8.448e-03 -2.130 0.033219 *
## monthly_gas_2km -2.634e-02 2.249e-02 -1.172 0.241372
## active_2km -1.138e+01 1.964e+00 -5.795 6.91e-09 ***
## inactive_2km 7.207e+01 6.829e+00 10.554 < 2e-16 ***
## elevation -3.683e+01 1.824e+00 -20.188 < 2e-16 ***
## EVI -6.007e+02 4.246e+01 -14.146 < 2e-16 ***
## num_odor_complaints -1.253e+00 6.419e-02 -19.512 < 2e-16 ***
## closest_wrp_capacity 1.175e+00 4.648e-01 2.527 0.011497 *
## hourly_downwind_wrp 7.409e+00 2.649e+00 2.797 0.005164 **
## hourly_temp -8.394e-01 1.383e-01 -6.071 1.29e-09 ***
## hourly_hum -1.086e-01 4.156e-02 -2.613 0.008980 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.707 8.932 83.18
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 78.390 80.000 63.25
## p-value
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 111/114
## R-sq.(adj) = 0.21 Deviance explained = 21.3%
## GCV = 13802 Scale est. = 13753 n = 30242
Exclude Disaster
# Exclude disaster
summary(ha_excl_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_hourly_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg +
## I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_gas_2km +
## active_2km + inactive_2km + elevation + EVI + num_odor_complaints +
## closest_wrp_capacity + hourly_downwind_ref + hourly_downwind_wrp +
## hourly_temp + hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.693e+00 1.197e-01 39.205 < 2e-16 ***
## year2021 5.288e-02 1.425e-02 3.710 0.000207 ***
## year2022 6.716e-01 2.260e-02 29.721 < 2e-16 ***
## year2023 5.851e-01 2.185e-02 26.781 < 2e-16 ***
## weekdayMon 8.528e-02 4.864e-03 17.533 < 2e-16 ***
## weekdayTue 1.443e-01 4.824e-03 29.916 < 2e-16 ***
## weekdayWed 1.673e-01 4.819e-03 34.710 < 2e-16 ***
## weekdayThu 1.531e-01 4.818e-03 31.776 < 2e-16 ***
## weekdayFri 1.415e-01 4.820e-03 29.355 < 2e-16 ***
## weekdaySat 6.738e-02 4.811e-03 14.006 < 2e-16 ***
## wd_avg -3.936e-04 1.529e-05 -25.741 < 2e-16 ***
## ws_avg -3.184e-02 5.121e-04 -62.177 < 2e-16 ***
## I(1/dist_wrp^2) 3.672e-07 1.137e-07 3.230 0.001236 **
## I(1/dist_ref^2) 1.079e-06 5.761e-06 0.187 0.851432
## I(1/dist_dc^2) 7.839e-04 7.907e-05 9.913 < 2e-16 ***
## monthly_gas_2km -4.583e-05 3.725e-06 -12.303 < 2e-16 ***
## active_2km 1.242e-02 8.628e-04 14.400 < 2e-16 ***
## inactive_2km -1.488e-02 2.657e-03 -5.602 2.12e-08 ***
## elevation -8.347e-03 1.114e-03 -7.493 6.74e-14 ***
## EVI -1.439e+00 4.685e-02 -30.713 < 2e-16 ***
## num_odor_complaints 5.676e-03 3.750e-04 15.137 < 2e-16 ***
## closest_wrp_capacity -2.247e-03 2.699e-04 -8.326 < 2e-16 ***
## hourly_downwind_ref -5.162e-02 4.624e-03 -11.164 < 2e-16 ***
## hourly_downwind_wrp 2.443e-02 5.112e-03 4.779 1.76e-06 ***
## hourly_temp -3.371e-02 2.802e-04 -120.339 < 2e-16 ***
## hourly_hum -9.323e-03 9.589e-05 -97.234 < 2e-16 ***
## hourly_precip -4.115e-01 1.143e-01 -3.599 0.000319 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.982 8 589.0
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000 9 379.2
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.922 80 404.6
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 121/124
## R-sq.(adj) = 0.248 Deviance explained = 24.8%
## GCV = 0.55993 Scale est. = 0.55973 n = 337596
Everything w Disaster Indicator
# Disaster indicator
summary(ha_dis_ind_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_hourly_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg +
## I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km +
## monthly_gas_2km + active_2km + inactive_2km + elevation +
## EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref +
## hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip +
## disaster
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.655e+02 6.608e+00 -25.044 < 2e-16 ***
## year2021 9.738e+00 7.510e-01 12.967 < 2e-16 ***
## year2022 1.244e+01 9.733e-01 12.786 < 2e-16 ***
## year2023 7.386e+00 9.119e-01 8.100 5.51e-16 ***
## weekdayMon -7.005e-01 2.303e-01 -3.041 0.00236 **
## weekdayTue -6.643e-01 2.285e-01 -2.907 0.00365 **
## weekdayWed -3.712e-01 2.282e-01 -1.626 0.10390
## weekdayThu -1.339e-01 2.282e-01 -0.587 0.55732
## weekdayFri 1.326e-01 2.279e-01 0.582 0.56076
## weekdaySat 6.125e-02 2.278e-01 0.269 0.78807
## wd_avg -7.282e-03 7.130e-04 -10.214 < 2e-16 ***
## ws_avg 1.617e-01 2.459e-02 6.575 4.86e-11 ***
## I(1/dist_wrp^2) 4.612e-06 4.095e-06 1.126 0.26007
## I(1/dist_ref^2) -3.239e-03 2.064e-04 -15.691 < 2e-16 ***
## I(1/dist_dc^2) 2.534e-01 1.217e-02 20.815 < 2e-16 ***
## monthly_oil_2km 3.964e-04 8.839e-05 4.485 7.30e-06 ***
## monthly_gas_2km 5.471e-05 1.889e-04 0.290 0.77207
## active_2km 3.530e-01 4.227e-02 8.349 < 2e-16 ***
## inactive_2km 2.244e+00 1.188e-01 18.885 < 2e-16 ***
## elevation -1.248e+00 5.320e-02 -23.453 < 2e-16 ***
## EVI -5.567e+01 2.181e+00 -25.528 < 2e-16 ***
## num_odor_complaints 5.717e-01 1.004e-02 56.940 < 2e-16 ***
## closest_wrp_capacity 4.143e-01 1.451e-02 28.545 < 2e-16 ***
## hourly_downwind_ref -2.019e+00 2.153e-01 -9.378 < 2e-16 ***
## hourly_downwind_wrp 1.544e+00 2.415e-01 6.394 1.62e-10 ***
## hourly_temp -2.308e-01 1.278e-02 -18.062 < 2e-16 ***
## hourly_hum -5.509e-02 4.338e-03 -12.699 < 2e-16 ***
## hourly_precip -3.925e-01 4.986e+00 -0.079 0.93725
## disaster 1.251e+01 6.201e-01 20.177 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.933 8 45.69
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000 9 93.50
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 78.186 80 78.08
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 123/126
## R-sq.(adj) = 0.0486 Deviance explained = 4.89%
## GCV = 1367.9 Scale est. = 1367.5 n = 367838
Everything w.o Disaster Indicator
# Everything
summary(ha_full_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_hourly_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg +
## I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km +
## monthly_gas_2km + active_2km + inactive_2km + elevation +
## EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref +
## hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.493e+02 6.542e+00 -22.826 < 2e-16 ***
## year2021 3.942e-01 5.909e-01 0.667 0.50461
## year2022 7.027e-01 7.793e-01 0.902 0.36717
## year2023 -4.345e-01 8.210e-01 -0.529 0.59662
## weekdayMon -7.023e-01 2.305e-01 -3.047 0.00231 **
## weekdayTue -6.474e-01 2.286e-01 -2.832 0.00463 **
## weekdayWed -3.577e-01 2.284e-01 -1.566 0.11723
## weekdayThu -1.384e-01 2.283e-01 -0.606 0.54425
## weekdayFri 1.508e-01 2.281e-01 0.661 0.50841
## weekdaySat 8.398e-02 2.280e-01 0.368 0.71257
## wd_avg -7.299e-03 7.134e-04 -10.233 < 2e-16 ***
## ws_avg 1.466e-01 2.459e-02 5.961 2.51e-09 ***
## I(1/dist_wrp^2) 5.719e-06 3.854e-06 1.484 0.13783
## I(1/dist_ref^2) -3.114e-03 1.960e-04 -15.884 < 2e-16 ***
## I(1/dist_dc^2) 2.378e-01 1.234e-02 19.263 < 2e-16 ***
## monthly_oil_2km 4.562e-04 8.775e-05 5.198 2.01e-07 ***
## monthly_gas_2km 1.570e-05 1.810e-04 0.087 0.93090
## active_2km 2.278e-01 4.164e-02 5.470 4.50e-08 ***
## inactive_2km 2.496e+00 1.173e-01 21.285 < 2e-16 ***
## elevation -1.319e+00 5.301e-02 -24.877 < 2e-16 ***
## EVI -5.419e+01 2.179e+00 -24.870 < 2e-16 ***
## num_odor_complaints 5.945e-01 9.984e-03 59.542 < 2e-16 ***
## closest_wrp_capacity 4.191e-01 1.447e-02 28.968 < 2e-16 ***
## hourly_downwind_ref -2.047e+00 2.154e-01 -9.502 < 2e-16 ***
## hourly_downwind_wrp 1.579e+00 2.416e-01 6.535 6.37e-11 ***
## hourly_temp -2.244e-01 1.278e-02 -17.556 < 2e-16 ***
## hourly_hum -5.482e-02 4.341e-03 -12.628 < 2e-16 ***
## hourly_precip -1.161e+00 4.988e+00 -0.233 0.81593
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.937 8 55.26
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000 9 91.91
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 77.563 80 76.00
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 122/125
## R-sq.(adj) = 0.0475 Deviance explained = 4.79%
## GCV = 1369.4 Scale est. = 1369 n = 367838
Since February 2022
# Since feb 2022
summary(log_ha_sincefeb2022_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_hourly_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg +
## I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km +
## monthly_gas_2km + active_2km + inactive_2km + elevation +
## EVI + num_odor_complaints + hourly_downwind_ref + hourly_downwind_wrp +
## hourly_temp + hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.336e+00 5.911e-02 56.447 < 2e-16 ***
## year2023 -9.704e-02 1.480e-02 -6.558 5.47e-11 ***
## weekdayMon 8.902e-02 5.302e-03 16.790 < 2e-16 ***
## weekdayTue 1.568e-01 5.255e-03 29.833 < 2e-16 ***
## weekdayWed 1.516e-01 5.268e-03 28.781 < 2e-16 ***
## weekdayThu 1.085e-01 5.272e-03 20.578 < 2e-16 ***
## weekdayFri 1.261e-01 5.252e-03 24.006 < 2e-16 ***
## weekdaySat 7.496e-02 5.237e-03 14.313 < 2e-16 ***
## wd_avg -3.786e-04 1.685e-05 -22.469 < 2e-16 ***
## ws_avg -6.109e-02 6.588e-04 -92.737 < 2e-16 ***
## I(1/dist_wrp^2) 1.258e-06 6.667e-08 18.867 < 2e-16 ***
## I(1/dist_ref^2) -1.235e-05 1.520e-06 -8.126 4.48e-16 ***
## I(1/dist_dc^2) -6.728e-04 2.313e-05 -29.090 < 2e-16 ***
## monthly_oil_2km -1.150e-06 2.058e-06 -0.559 0.5764
## monthly_gas_2km -4.434e-05 6.029e-06 -7.354 1.93e-13 ***
## active_2km 1.721e-02 1.650e-03 10.432 < 2e-16 ***
## inactive_2km -3.752e-03 3.427e-03 -1.095 0.2736
## elevation -4.561e-02 2.707e-03 -16.846 < 2e-16 ***
## EVI -2.375e+00 6.238e-02 -38.078 < 2e-16 ***
## num_odor_complaints 1.326e-02 6.863e-04 19.327 < 2e-16 ***
## hourly_downwind_ref -6.484e-03 5.150e-03 -1.259 0.2081
## hourly_downwind_wrp 4.752e-02 5.953e-03 7.983 1.44e-15 ***
## hourly_temp -3.135e-02 3.154e-04 -99.379 < 2e-16 ***
## hourly_hum -9.800e-03 1.119e-04 -87.564 < 2e-16 ***
## hourly_precip 2.371e-01 9.590e-02 2.473 0.0134 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.878 8 144.2
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000 9 335.7
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.994 80 843.7
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 119/122
## R-sq.(adj) = 0.547 Deviance explained = 54.7%
## GCV = 0.30229 Scale est. = 0.30206 n = 153718
Disaster Only
# Disaster only
summary(log_ha_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_hourly_avg) ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp",
## "cc")) + month + weekday + wd_avg + ws_avg + I(1/dist_wrp^2) +
## I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km +
## inactive_2km + elevation + num_odor_complaints + closest_wrp_capacity +
## hourly_downwind_ref + hourly_downwind_wrp + hourly_temp +
## hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.338e+01 1.863e+00 -12.550 < 2e-16 ***
## month11 1.553e-01 3.505e-02 4.431 9.39e-06 ***
## month12 -1.110e-01 3.620e-02 -3.065 0.00218 **
## weekdayMon 2.516e-02 1.808e-02 1.392 0.16397
## weekdayTue -9.922e-02 1.814e-02 -5.470 4.54e-08 ***
## weekdayWed 1.126e-01 1.791e-02 6.291 3.20e-10 ***
## weekdayThu 9.509e-02 1.806e-02 5.265 1.41e-07 ***
## weekdayFri 3.090e-02 1.749e-02 1.767 0.07732 .
## weekdaySat -8.340e-02 1.769e-02 -4.714 2.44e-06 ***
## wd_avg -7.720e-04 4.893e-05 -15.778 < 2e-16 ***
## ws_avg -1.110e-01 2.666e-03 -41.647 < 2e-16 ***
## I(1/dist_wrp^2) -4.535e-05 3.437e-06 -13.194 < 2e-16 ***
## I(1/dist_ref^2) -2.659e-04 1.972e-05 -13.487 < 2e-16 ***
## I(1/dist_dc^2) 3.288e-01 2.425e-02 13.559 < 2e-16 ***
## monthly_oil_2km 6.284e-04 4.022e-05 15.627 < 2e-16 ***
## active_2km -3.871e-01 1.253e-02 -30.882 < 2e-16 ***
## inactive_2km 6.700e-01 4.309e-02 15.551 < 2e-16 ***
## elevation -2.089e-01 1.147e-02 -18.209 < 2e-16 ***
## num_odor_complaints 4.279e-03 4.510e-04 9.489 < 2e-16 ***
## closest_wrp_capacity 6.845e-02 3.546e-03 19.302 < 2e-16 ***
## hourly_downwind_ref -1.641e-01 1.466e-02 -11.197 < 2e-16 ***
## hourly_downwind_wrp 2.034e-01 1.867e-02 10.898 < 2e-16 ***
## hourly_temp -3.594e-02 9.671e-04 -37.166 < 2e-16 ***
## hourly_hum -1.034e-02 2.939e-04 -35.177 < 2e-16 ***
## hourly_precip 1.048e+00 2.535e-01 4.134 3.57e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.00 9 250.62
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.37 80 88.83
## p-value
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 111/114
## R-sq.(adj) = 0.583 Deviance explained = 58.5%
## GCV = 0.67426 Scale est. = 0.6718 n = 30242
Exclude Disaster
# Exclude disaster
summary(log_ha_excl_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_hourly_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg +
## I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km +
## monthly_gas_2km + active_2km + inactive_2km + elevation +
## EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref +
## hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.438e+00 1.212e-01 28.361 < 2e-16 ***
## year2021 1.177e-01 1.190e-02 9.893 < 2e-16 ***
## year2022 5.850e-01 1.884e-02 31.048 < 2e-16 ***
## year2023 6.066e-01 1.822e-02 33.297 < 2e-16 ***
## weekdayMon 9.370e-02 4.054e-03 23.110 < 2e-16 ***
## weekdayTue 1.468e-01 4.021e-03 36.515 < 2e-16 ***
## weekdayWed 1.700e-01 4.017e-03 42.329 < 2e-16 ***
## weekdayThu 1.546e-01 4.016e-03 38.487 < 2e-16 ***
## weekdayFri 1.446e-01 4.018e-03 35.979 < 2e-16 ***
## weekdaySat 7.364e-02 4.010e-03 18.365 < 2e-16 ***
## wd_avg -6.106e-04 1.275e-05 -47.905 < 2e-16 ***
## ws_avg -3.803e-02 4.269e-04 -89.094 < 2e-16 ***
## I(1/dist_wrp^2) 2.227e-06 1.127e-07 19.759 < 2e-16 ***
## I(1/dist_ref^2) -9.849e-05 4.990e-06 -19.737 < 2e-16 ***
## I(1/dist_dc^2) 1.579e-03 1.718e-04 9.192 < 2e-16 ***
## monthly_oil_2km -7.810e-06 1.655e-06 -4.719 2.37e-06 ***
## monthly_gas_2km -6.010e-05 3.268e-06 -18.391 < 2e-16 ***
## active_2km 2.310e-02 7.677e-04 30.086 < 2e-16 ***
## inactive_2km -1.921e-02 2.228e-03 -8.624 < 2e-16 ***
## elevation -2.088e-03 9.309e-04 -2.243 0.0249 *
## EVI -2.289e+00 3.907e-02 -58.570 < 2e-16 ***
## num_odor_complaints 2.732e-03 3.126e-04 8.740 < 2e-16 ***
## closest_wrp_capacity -2.991e-03 2.680e-04 -11.159 < 2e-16 ***
## hourly_downwind_ref -3.551e-02 3.854e-03 -9.212 < 2e-16 ***
## hourly_downwind_wrp 6.536e-02 4.262e-03 15.337 < 2e-16 ***
## hourly_temp -3.147e-02 2.335e-04 -134.734 < 2e-16 ***
## hourly_hum -8.700e-03 7.993e-05 -108.836 < 2e-16 ***
## hourly_precip -7.798e-01 9.530e-02 -8.183 2.77e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.997 8 1039
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000 9 1335
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.987 80 1297
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 122/125
## R-sq.(adj) = 0.444 Deviance explained = 44.4%
## GCV = 0.38907 Scale est. = 0.38893 n = 337596
Everything w Disaster Indicator
# Disaster indicator
summary(log_ha_dis_ind_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_hourly_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg +
## I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km +
## active_2km + inactive_2km + elevation + EVI + num_odor_complaints +
## closest_wrp_capacity + hourly_downwind_ref + hourly_downwind_wrp +
## hourly_temp + hourly_hum + hourly_precip + disaster
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.067e-02 1.212e-01 -0.171 0.86458
## year2021 8.026e-01 1.377e-02 58.302 < 2e-16 ***
## year2022 9.504e-01 1.786e-02 53.227 < 2e-16 ***
## year2023 8.091e-01 1.674e-02 48.335 < 2e-16 ***
## weekdayMon 8.113e-02 4.203e-03 19.304 < 2e-16 ***
## weekdayTue 1.188e-01 4.169e-03 28.483 < 2e-16 ***
## weekdayWed 1.595e-01 4.164e-03 38.294 < 2e-16 ***
## weekdayThu 1.431e-01 4.163e-03 34.359 < 2e-16 ***
## weekdayFri 1.285e-01 4.159e-03 30.896 < 2e-16 ***
## weekdaySat 5.813e-02 4.157e-03 13.983 < 2e-16 ***
## wd_avg -6.860e-04 1.301e-05 -52.730 < 2e-16 ***
## ws_avg -4.032e-02 4.487e-04 -89.855 < 2e-16 ***
## I(1/dist_wrp^2) 1.401e-06 9.995e-08 14.014 < 2e-16 ***
## I(1/dist_ref^2) -1.294e-04 4.461e-06 -29.013 < 2e-16 ***
## I(1/dist_dc^2) 8.653e-03 2.466e-04 35.091 < 2e-16 ***
## monthly_oil_2km -5.838e-06 1.559e-06 -3.745 0.00018 ***
## active_2km 3.100e-02 7.878e-04 39.344 < 2e-16 ***
## inactive_2km 1.274e-03 2.267e-03 0.562 0.57408
## elevation -2.284e-02 9.736e-04 -23.459 < 2e-16 ***
## EVI -3.169e+00 3.971e-02 -79.798 < 2e-16 ***
## num_odor_complaints 1.141e-02 1.833e-04 62.257 < 2e-16 ***
## closest_wrp_capacity 3.900e-03 2.660e-04 14.661 < 2e-16 ***
## hourly_downwind_ref -7.036e-02 3.929e-03 -17.906 < 2e-16 ***
## hourly_downwind_wrp 8.627e-02 4.403e-03 19.591 < 2e-16 ***
## hourly_temp -3.407e-02 2.332e-04 -146.077 < 2e-16 ***
## hourly_hum -9.317e-03 7.917e-05 -117.683 < 2e-16 ***
## hourly_precip -8.818e-01 9.098e-02 -9.692 < 2e-16 ***
## disaster 7.991e-01 1.135e-02 70.405 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.971 8 570.4
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.996 9 851.2
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.931 80 1225.0
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 122/125
## R-sq.(adj) = 0.443 Deviance explained = 44.3%
## GCV = 0.45541 Scale est. = 0.45526 n = 367838
Everything w.o Disaster Indicator
# Everything
summary(log_ha_full_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_hourly_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg +
## I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km +
## active_2km + inactive_2km + elevation + EVI + num_odor_complaints +
## closest_wrp_capacity + hourly_downwind_ref + hourly_downwind_wrp +
## hourly_temp + hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.748e-01 1.212e-01 8.042 8.83e-16 ***
## year2021 2.031e-01 1.089e-02 18.655 < 2e-16 ***
## year2022 1.974e-01 1.439e-02 13.718 < 2e-16 ***
## year2023 3.022e-01 1.521e-02 19.865 < 2e-16 ***
## weekdayMon 8.100e-02 4.231e-03 19.144 < 2e-16 ***
## weekdayTue 1.198e-01 4.197e-03 28.542 < 2e-16 ***
## weekdayWed 1.603e-01 4.192e-03 38.239 < 2e-16 ***
## weekdayThu 1.428e-01 4.191e-03 34.058 < 2e-16 ***
## weekdayFri 1.296e-01 4.187e-03 30.966 < 2e-16 ***
## weekdaySat 5.958e-02 4.185e-03 14.237 < 2e-16 ***
## wd_avg -6.873e-04 1.310e-05 -52.484 < 2e-16 ***
## ws_avg -4.127e-02 4.516e-04 -91.394 < 2e-16 ***
## I(1/dist_wrp^2) 1.724e-06 9.896e-08 17.427 < 2e-16 ***
## I(1/dist_ref^2) -1.317e-04 4.499e-06 -29.269 < 2e-16 ***
## I(1/dist_dc^2) 7.371e-03 2.379e-04 30.983 < 2e-16 ***
## monthly_oil_2km -2.352e-06 1.568e-06 -1.500 0.134
## active_2km 2.248e-02 7.841e-04 28.664 < 2e-16 ***
## inactive_2km 1.950e-02 2.270e-03 8.592 < 2e-16 ***
## elevation -2.771e-02 9.777e-04 -28.342 < 2e-16 ***
## EVI -3.077e+00 3.995e-02 -77.014 < 2e-16 ***
## num_odor_complaints 1.285e-02 1.833e-04 70.093 < 2e-16 ***
## closest_wrp_capacity 4.307e-03 2.678e-04 16.082 < 2e-16 ***
## hourly_downwind_ref -7.210e-02 3.956e-03 -18.228 < 2e-16 ***
## hourly_downwind_wrp 8.823e-02 4.433e-03 19.904 < 2e-16 ***
## hourly_temp -3.366e-02 2.347e-04 -143.410 < 2e-16 ***
## hourly_hum -9.297e-03 7.970e-05 -116.645 < 2e-16 ***
## hourly_precip -9.332e-01 9.158e-02 -10.190 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.951 8 651.1
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.997 9 775.6
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.952 80 1219.0
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 121/124
## R-sq.(adj) = 0.435 Deviance explained = 43.6%
## GCV = 0.46155 Scale est. = 0.4614 n = 367838
Since February 2022
# Since feb 2022
summary(hm_sincefeb2022_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_hourly_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg +
## I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km +
## monthly_gas_2km + inactive_2km + elevation + EVI + num_odor_complaints +
## closest_wrp_capacity + hourly_downwind_ref + hourly_downwind_wrp +
## hourly_temp + hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.120e+00 3.168e-01 22.474 < 2e-16 ***
## year2023 -1.849e-01 3.178e-02 -5.817 5.99e-09 ***
## weekdayMon 1.227e-01 1.143e-02 10.736 < 2e-16 ***
## weekdayTue 2.126e-01 1.133e-02 18.771 < 2e-16 ***
## weekdayWed 2.206e-01 1.136e-02 19.423 < 2e-16 ***
## weekdayThu 1.692e-01 1.136e-02 14.891 < 2e-16 ***
## weekdayFri 1.987e-01 1.132e-02 17.547 < 2e-16 ***
## weekdaySat 8.991e-02 1.129e-02 7.964 1.68e-15 ***
## wd_avg -6.799e-05 3.632e-05 -1.872 0.061225 .
## ws_avg -6.652e-02 1.420e-03 -46.847 < 2e-16 ***
## I(1/dist_wrp^2) 7.978e-07 1.345e-07 5.932 3.00e-09 ***
## I(1/dist_ref^2) 2.187e-05 1.478e-06 14.797 < 2e-16 ***
## I(1/dist_dc^2) -4.692e-04 2.367e-05 -19.828 < 2e-16 ***
## monthly_oil_2km 1.217e-05 4.157e-06 2.928 0.003411 **
## monthly_gas_2km 2.136e-06 1.268e-05 0.168 0.866236
## inactive_2km -1.685e-02 8.289e-03 -2.033 0.042025 *
## elevation -4.174e-02 7.125e-03 -5.857 4.71e-09 ***
## EVI -1.478e+00 1.340e-01 -11.028 < 2e-16 ***
## num_odor_complaints 3.373e-02 1.479e-03 22.803 < 2e-16 ***
## closest_wrp_capacity -3.793e-03 8.208e-04 -4.621 3.82e-06 ***
## hourly_downwind_ref -4.602e-02 1.110e-02 -4.145 3.41e-05 ***
## hourly_downwind_wrp 4.670e-02 1.283e-02 3.640 0.000273 ***
## hourly_temp -3.803e-02 6.784e-04 -56.060 < 2e-16 ***
## hourly_hum -1.131e-02 2.406e-04 -46.995 < 2e-16 ***
## hourly_precip 1.131e+00 2.067e-01 5.469 4.54e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.659 8.00 35.07
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.260 8.26 96.92
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.974 80.00 96.24
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 118/122
## R-sq.(adj) = 0.175 Deviance explained = 17.5%
## GCV = 1.4048 Scale est. = 1.4037 n = 153718
Disaster Only
# Disaster only
summary(hm_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_hourly_max ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp",
## k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + month +
## weekday + wd_avg + ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) +
## I(1/dist_dc^2) + monthly_oil_2km + monthly_gas_2km + active_2km +
## inactive_2km + elevation + EVI + num_odor_complaints + closest_wrp_capacity +
## hourly_downwind_wrp + hourly_temp + hourly_hum
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.751e+02 5.365e+02 1.445 0.148540
## month11 -1.138e+01 1.069e+01 -1.065 0.286956
## month12 -6.246e-01 1.157e+01 -0.054 0.956966
## weekdayMon -4.502e+00 5.492e+00 -0.820 0.412317
## weekdayTue -1.019e+01 5.510e+00 -1.850 0.064385 .
## weekdayWed -7.691e-01 5.439e+00 -0.141 0.887550
## weekdayThu 5.550e+00 5.432e+00 1.022 0.306954
## weekdayFri 9.091e+00 5.312e+00 1.711 0.087021 .
## weekdaySat 7.882e+00 5.374e+00 1.467 0.142518
## wd_avg -7.400e-02 1.474e-02 -5.020 5.21e-07 ***
## ws_avg 3.090e+00 7.859e-01 3.932 8.45e-05 ***
## I(1/dist_wrp^2) -1.485e-03 3.691e-04 -4.023 5.76e-05 ***
## I(1/dist_ref^2) -3.664e-03 6.026e-03 -0.608 0.543217
## I(1/dist_dc^2) 1.315e+01 3.620e+00 3.632 0.000282 ***
## monthly_oil_2km -3.991e-02 1.803e-02 -2.213 0.026905 *
## monthly_gas_2km -6.194e-02 4.771e-02 -1.298 0.194215
## active_2km -2.612e+01 4.175e+00 -6.257 3.97e-10 ***
## inactive_2km 1.629e+02 1.456e+01 11.184 < 2e-16 ***
## elevation -8.281e+01 3.893e+00 -21.274 < 2e-16 ***
## EVI -1.337e+03 9.018e+01 -14.826 < 2e-16 ***
## num_odor_complaints -2.810e+00 1.361e-01 -20.642 < 2e-16 ***
## closest_wrp_capacity 2.611e+00 9.956e-01 2.623 0.008732 **
## hourly_downwind_wrp 1.212e+01 5.625e+00 2.155 0.031166 *
## hourly_temp -1.725e+00 2.936e-01 -5.876 4.25e-09 ***
## hourly_hum -2.430e-01 8.825e-02 -2.754 0.005899 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.723 8.939 92.02
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 78.248 80.000 68.80
## p-value
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 111/114
## R-sq.(adj) = 0.223 Deviance explained = 22.6%
## GCV = 62223 Scale est. = 61999 n = 30242
Exclude Disaster
# Exclude disaster
summary(hm_excl_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_hourly_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg +
## I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + active_2km +
## inactive_2km + elevation + EVI + num_odor_complaints + closest_wrp_capacity +
## hourly_downwind_ref + hourly_downwind_wrp + hourly_temp +
## hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.511e+00 1.998e-01 37.594 < 2e-16 ***
## year2021 5.362e-02 2.429e-02 2.208 0.0273 *
## year2022 7.902e-01 3.857e-02 20.485 < 2e-16 ***
## year2023 6.866e-01 3.720e-02 18.457 < 2e-16 ***
## weekdayMon 1.369e-01 8.297e-03 16.496 < 2e-16 ***
## weekdayTue 2.172e-01 8.229e-03 26.400 < 2e-16 ***
## weekdayWed 2.380e-01 8.221e-03 28.949 < 2e-16 ***
## weekdayThu 2.190e-01 8.219e-03 26.642 < 2e-16 ***
## weekdayFri 2.048e-01 8.222e-03 24.906 < 2e-16 ***
## weekdaySat 9.189e-02 8.207e-03 11.198 < 2e-16 ***
## wd_avg -3.411e-04 2.608e-05 -13.080 < 2e-16 ***
## ws_avg -4.306e-02 8.735e-04 -49.299 < 2e-16 ***
## I(1/dist_wrp^2) -5.954e-01 2.387e-02 -24.946 < 2e-16 ***
## I(1/dist_ref^2) 5.514e+00 2.210e-01 24.946 < 2e-16 ***
## I(1/dist_dc^2) 4.001e+03 1.604e+02 24.946 < 2e-16 ***
## active_2km 1.363e-02 1.446e-03 9.429 < 2e-16 ***
## inactive_2km -6.320e-02 4.472e-03 -14.132 < 2e-16 ***
## elevation -7.581e-04 1.921e-03 -0.395 0.6931
## EVI -1.173e+00 8.060e-02 -14.553 < 2e-16 ***
## num_odor_complaints 1.364e-02 6.396e-04 21.318 < 2e-16 ***
## closest_wrp_capacity -7.191e-03 4.534e-04 -15.858 < 2e-16 ***
## hourly_downwind_ref -6.138e-02 7.893e-03 -7.776 7.53e-15 ***
## hourly_downwind_wrp 4.410e-02 8.714e-03 5.061 4.17e-07 ***
## hourly_temp -3.956e-02 4.779e-04 -82.787 < 2e-16 ***
## hourly_hum -1.116e-02 1.636e-04 -68.199 < 2e-16 ***
## hourly_precip -5.181e-02 1.950e-01 -0.266 0.7905
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.965 8.000 242.2
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.945 8.992 253.8
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.659 80.000 187.1
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 121/123
## R-sq.(adj) = 0.15 Deviance explained = 15%
## GCV = 1.6295 Scale est. = 1.6289 n = 337596
Everything w Disaster Indicator
# Disaster indicator
summary(hm_dis_ind_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_hourly_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg +
## I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km +
## monthly_gas_2km + active_2km + inactive_2km + elevation +
## EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref +
## hourly_downwind_wrp + hourly_temp + hourly_hum + disaster
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.670e+02 1.413e+01 -25.969 < 2e-16 ***
## year2021 2.018e+01 1.606e+00 12.568 < 2e-16 ***
## year2022 2.582e+01 2.078e+00 12.422 < 2e-16 ***
## year2023 1.517e+01 1.950e+00 7.781 7.20e-15 ***
## weekdayMon -1.362e+00 4.926e-01 -2.765 0.00569 **
## weekdayTue -1.285e+00 4.887e-01 -2.631 0.00853 **
## weekdayWed -7.367e-01 4.881e-01 -1.509 0.13121
## weekdayThu -2.720e-01 4.879e-01 -0.557 0.57721
## weekdayFri 4.210e-01 4.874e-01 0.864 0.38773
## weekdaySat 3.651e-01 4.873e-01 0.749 0.45366
## wd_avg -1.679e-02 1.523e-03 -11.024 < 2e-16 ***
## ws_avg 3.779e-01 5.220e-02 7.241 4.46e-13 ***
## I(1/dist_wrp^2) 8.737e-06 8.794e-06 0.993 0.32047
## I(1/dist_ref^2) -7.063e-03 4.412e-04 -16.010 < 2e-16 ***
## I(1/dist_dc^2) 5.548e-01 2.627e-02 21.119 < 2e-16 ***
## monthly_oil_2km 8.650e-04 1.890e-04 4.576 4.74e-06 ***
## monthly_gas_2km 1.396e-04 4.037e-04 0.346 0.72950
## active_2km 7.637e-01 9.040e-02 8.449 < 2e-16 ***
## inactive_2km 4.959e+00 2.540e-01 19.525 < 2e-16 ***
## elevation -2.749e+00 1.138e-01 -24.162 < 2e-16 ***
## EVI -1.210e+02 4.664e+00 -25.946 < 2e-16 ***
## num_odor_complaints 1.268e+00 2.147e-02 59.044 < 2e-16 ***
## closest_wrp_capacity 9.131e-01 3.104e-02 29.419 < 2e-16 ***
## hourly_downwind_ref -4.366e+00 4.604e-01 -9.484 < 2e-16 ***
## hourly_downwind_wrp 3.046e+00 5.165e-01 5.898 3.68e-09 ***
## hourly_temp -4.631e-01 2.729e-02 -16.972 < 2e-16 ***
## hourly_hum -1.129e-01 9.201e-03 -12.272 < 2e-16 ***
## disaster 2.610e+01 1.326e+00 19.679 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.933 8 46.65
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000 9 98.34
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 78.203 80 82.06
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 122/125
## R-sq.(adj) = 0.0505 Deviance explained = 5.09%
## GCV = 6256.4 Scale est. = 6254.3 n = 367838
Everything w.o Disaster Indicator
# Everything
summary(hm_full_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_hourly_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg +
## I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km +
## monthly_gas_2km + active_2km + inactive_2km + elevation +
## EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref +
## hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.333e+02 1.399e+01 -23.814 < 2e-16 ***
## year2021 6.996e-01 1.264e+00 0.554 0.5799
## year2022 1.328e+00 1.667e+00 0.797 0.4255
## year2023 -1.128e+00 1.757e+00 -0.642 0.5206
## weekdayMon -1.366e+00 4.929e-01 -2.771 0.0056 **
## weekdayTue -1.250e+00 4.889e-01 -2.556 0.0106 *
## weekdayWed -7.087e-01 4.884e-01 -1.451 0.1467
## weekdayThu -2.809e-01 4.882e-01 -0.575 0.5651
## weekdayFri 4.593e-01 4.877e-01 0.942 0.3463
## weekdaySat 4.126e-01 4.875e-01 0.846 0.3974
## wd_avg -1.683e-02 1.526e-03 -11.031 < 2e-16 ***
## ws_avg 3.468e-01 5.259e-02 6.594 4.30e-11 ***
## I(1/dist_wrp^2) 1.139e-05 8.281e-06 1.375 0.1691
## I(1/dist_ref^2) -6.810e-03 4.205e-04 -16.196 < 2e-16 ***
## I(1/dist_dc^2) 5.209e-01 2.643e-02 19.704 < 2e-16 ***
## monthly_oil_2km 9.888e-04 1.878e-04 5.266 1.40e-07 ***
## monthly_gas_2km 5.603e-05 3.883e-04 0.144 0.8853
## active_2km 5.029e-01 8.908e-02 5.645 1.65e-08 ***
## inactive_2km 5.486e+00 2.509e-01 21.870 < 2e-16 ***
## elevation -2.897e+00 1.134e-01 -25.554 < 2e-16 ***
## EVI -1.179e+02 4.660e+00 -25.306 < 2e-16 ***
## num_odor_complaints 1.315e+00 2.135e-02 61.602 < 2e-16 ***
## closest_wrp_capacity 9.232e-01 3.095e-02 29.826 < 2e-16 ***
## hourly_downwind_ref -4.424e+00 4.607e-01 -9.602 < 2e-16 ***
## hourly_downwind_wrp 3.119e+00 5.167e-01 6.036 1.58e-09 ***
## hourly_temp -4.497e-01 2.734e-02 -16.450 < 2e-16 ***
## hourly_hum -1.123e-01 9.283e-03 -12.096 < 2e-16 ***
## hourly_precip -2.137e+00 1.067e+01 -0.200 0.8412
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.936 8 55.46
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000 9 96.91
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 77.617 80 79.98
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 122/125
## R-sq.(adj) = 0.0495 Deviance explained = 4.99%
## GCV = 6263 Scale est. = 6260.9 n = 367838
Since February 2022
# Since feb 2022
summary(log_hm_sincefeb2022_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_hourly_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg +
## I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km +
## monthly_gas_2km + active_2km + inactive_2km + elevation +
## EVI + num_odor_complaints + hourly_downwind_ref + hourly_downwind_wrp +
## hourly_temp + hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.479e+00 6.855e-02 50.745 < 2e-16 ***
## year2023 -1.173e-01 1.715e-02 -6.840 7.92e-12 ***
## weekdayMon 1.137e-01 6.151e-03 18.479 < 2e-16 ***
## weekdayTue 1.806e-01 6.096e-03 29.626 < 2e-16 ***
## weekdayWed 1.744e-01 6.111e-03 28.536 < 2e-16 ***
## weekdayThu 1.292e-01 6.115e-03 21.126 < 2e-16 ***
## weekdayFri 1.427e-01 6.093e-03 23.423 < 2e-16 ***
## weekdaySat 8.645e-02 6.075e-03 14.229 < 2e-16 ***
## wd_avg -4.281e-04 1.955e-05 -21.905 < 2e-16 ***
## ws_avg -6.686e-02 7.642e-04 -87.498 < 2e-16 ***
## I(1/dist_wrp^2) 6.971e-07 7.730e-08 9.018 < 2e-16 ***
## I(1/dist_ref^2) 2.743e-07 1.763e-06 0.156 0.876375
## I(1/dist_dc^2) -5.415e-04 2.683e-05 -20.182 < 2e-16 ***
## monthly_oil_2km -5.215e-06 2.388e-06 -2.184 0.028956 *
## monthly_gas_2km -3.444e-05 6.993e-06 -4.924 8.48e-07 ***
## active_2km 2.074e-02 1.913e-03 10.840 < 2e-16 ***
## inactive_2km -1.233e-02 3.975e-03 -3.102 0.001923 **
## elevation -3.940e-02 3.140e-03 -12.548 < 2e-16 ***
## EVI -2.347e+00 7.235e-02 -32.431 < 2e-16 ***
## num_odor_complaints 1.645e-02 7.961e-04 20.659 < 2e-16 ***
## hourly_downwind_ref -4.962e-03 5.975e-03 -0.831 0.406204
## hourly_downwind_wrp 7.644e-02 6.905e-03 11.070 < 2e-16 ***
## hourly_temp -3.153e-02 3.658e-04 -86.202 < 2e-16 ***
## hourly_hum -1.023e-02 1.298e-04 -78.825 < 2e-16 ***
## hourly_precip 4.100e-01 1.112e-01 3.685 0.000228 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.832 8 113.5
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.999 9 223.6
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.996 80 658.9
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 119/122
## R-sq.(adj) = 0.495 Deviance explained = 49.6%
## GCV = 0.40679 Scale est. = 0.40648 n = 153718
Disaster Only
# Disaster only
summary(log_hm_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_hourly_max) ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp",
## "cc")) + month + weekday + wd_avg + ws_avg + I(1/dist_wrp^2) +
## I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km +
## inactive_2km + elevation + num_odor_complaints + closest_wrp_capacity +
## hourly_downwind_ref + hourly_downwind_wrp + hourly_temp +
## hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.300e+01 2.000e+00 -11.498 < 2e-16 ***
## month11 1.587e-01 3.764e-02 4.216 2.50e-05 ***
## month12 -7.281e-02 3.887e-02 -1.873 0.061078 .
## weekdayMon 5.560e-02 1.941e-02 2.864 0.004188 **
## weekdayTue -7.745e-02 1.948e-02 -3.976 7.04e-05 ***
## weekdayWed 1.200e-01 1.923e-02 6.242 4.37e-10 ***
## weekdayThu 1.086e-01 1.940e-02 5.597 2.20e-08 ***
## weekdayFri 3.229e-02 1.879e-02 1.719 0.085638 .
## weekdaySat -7.210e-02 1.900e-02 -3.795 0.000148 ***
## wd_avg -7.589e-04 5.254e-05 -14.443 < 2e-16 ***
## ws_avg -1.126e-01 2.863e-03 -39.351 < 2e-16 ***
## I(1/dist_wrp^2) -4.404e-05 3.692e-06 -11.929 < 2e-16 ***
## I(1/dist_ref^2) -2.540e-04 2.117e-05 -11.999 < 2e-16 ***
## I(1/dist_dc^2) 3.175e-01 2.604e-02 12.191 < 2e-16 ***
## monthly_oil_2km 6.092e-04 4.319e-05 14.105 < 2e-16 ***
## active_2km -4.152e-01 1.346e-02 -30.847 < 2e-16 ***
## inactive_2km 7.570e-01 4.627e-02 16.360 < 2e-16 ***
## elevation -2.215e-01 1.232e-02 -17.975 < 2e-16 ***
## num_odor_complaints 4.079e-03 4.842e-04 8.424 < 2e-16 ***
## closest_wrp_capacity 6.850e-02 3.809e-03 17.985 < 2e-16 ***
## hourly_downwind_ref -1.588e-01 1.574e-02 -10.090 < 2e-16 ***
## hourly_downwind_wrp 2.311e-01 2.005e-02 11.528 < 2e-16 ***
## hourly_temp -3.327e-02 1.039e-03 -32.031 < 2e-16 ***
## hourly_hum -9.715e-03 3.157e-04 -30.775 < 2e-16 ***
## hourly_precip 1.019e+00 2.722e-01 3.744 0.000181 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.00 9 238.98
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.36 80 89.27
## p-value
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 111/114
## R-sq.(adj) = 0.56 Deviance explained = 56.2%
## GCV = 0.77762 Scale est. = 0.77478 n = 30242
Exclude Disaster
# Exclude disaster
summary(log_hm_excl_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_hourly_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg +
## I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km +
## monthly_gas_2km + active_2km + inactive_2km + elevation +
## EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref +
## hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.634e+00 1.343e-01 34.500 < 2e-16 ***
## year2021 9.424e-02 1.318e-02 7.148 8.82e-13 ***
## year2022 5.780e-01 2.088e-02 27.687 < 2e-16 ***
## year2023 6.159e-01 2.018e-02 30.512 < 2e-16 ***
## weekdayMon 1.199e-01 4.493e-03 26.683 < 2e-16 ***
## weekdayTue 1.721e-01 4.455e-03 38.635 < 2e-16 ***
## weekdayWed 1.958e-01 4.451e-03 43.977 < 2e-16 ***
## weekdayThu 1.770e-01 4.451e-03 39.768 < 2e-16 ***
## weekdayFri 1.646e-01 4.452e-03 36.969 < 2e-16 ***
## weekdaySat 8.162e-02 4.444e-03 18.368 < 2e-16 ***
## wd_avg -6.350e-04 1.412e-05 -44.959 < 2e-16 ***
## ws_avg -4.240e-02 4.730e-04 -89.640 < 2e-16 ***
## I(1/dist_wrp^2) 1.299e-06 1.247e-07 10.419 < 2e-16 ***
## I(1/dist_ref^2) -4.757e-05 5.521e-06 -8.617 < 2e-16 ***
## I(1/dist_dc^2) 1.379e-03 1.904e-04 7.240 4.49e-13 ***
## monthly_oil_2km -1.082e-05 1.834e-06 -5.903 3.57e-09 ***
## monthly_gas_2km -5.363e-05 3.621e-06 -14.812 < 2e-16 ***
## active_2km 2.563e-02 8.506e-04 30.131 < 2e-16 ***
## inactive_2km -4.244e-02 2.468e-03 -17.200 < 2e-16 ***
## elevation 4.822e-03 1.032e-03 4.674 2.95e-06 ***
## EVI -2.087e+00 4.330e-02 -48.198 < 2e-16 ***
## num_odor_complaints 4.266e-03 3.464e-04 12.314 < 2e-16 ***
## closest_wrp_capacity -5.463e-03 2.970e-04 -18.395 < 2e-16 ***
## hourly_downwind_ref -3.337e-02 4.271e-03 -7.814 5.55e-15 ***
## hourly_downwind_wrp 8.360e-02 4.722e-03 17.703 < 2e-16 ***
## hourly_temp -3.081e-02 2.588e-04 -119.072 < 2e-16 ***
## hourly_hum -8.804e-03 8.857e-05 -99.407 < 2e-16 ***
## hourly_precip -6.942e-01 1.056e-01 -6.574 4.92e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.978 8 876.7
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000 9 1206.9
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.979 80 1125.7
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 122/125
## R-sq.(adj) = 0.411 Deviance explained = 41.1%
## GCV = 0.47774 Scale est. = 0.47757 n = 337596
Everything w Disaster Indicator
# Disaster indicator
summary(log_hm_dis_ind_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_hourly_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg +
## I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km +
## monthly_gas_2km + active_2km + inactive_2km + elevation +
## EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref +
## hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip +
## disaster
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.489e-01 1.333e-01 7.118 1.09e-12 ***
## year2021 8.889e-01 1.512e-02 58.793 < 2e-16 ***
## year2022 1.034e+00 1.960e-02 52.740 < 2e-16 ***
## year2023 9.078e-01 1.848e-02 49.122 < 2e-16 ***
## weekdayMon 1.078e-01 4.608e-03 23.389 < 2e-16 ***
## weekdayTue 1.440e-01 4.571e-03 31.503 < 2e-16 ***
## weekdayWed 1.841e-01 4.566e-03 40.324 < 2e-16 ***
## weekdayThu 1.651e-01 4.565e-03 36.163 < 2e-16 ***
## weekdayFri 1.471e-01 4.560e-03 32.271 < 2e-16 ***
## weekdaySat 6.604e-02 4.558e-03 14.491 < 2e-16 ***
## wd_avg -7.062e-04 1.426e-05 -49.513 < 2e-16 ***
## ws_avg -4.431e-02 4.920e-04 -90.057 < 2e-16 ***
## I(1/dist_wrp^2) 4.094e-07 1.043e-07 3.924 8.71e-05 ***
## I(1/dist_ref^2) -7.222e-05 5.171e-06 -13.968 < 2e-16 ***
## I(1/dist_dc^2) 7.702e-03 2.318e-04 33.222 < 2e-16 ***
## monthly_oil_2km -6.735e-06 1.786e-06 -3.772 0.000162 ***
## monthly_gas_2km -2.692e-05 4.007e-06 -6.717 1.86e-11 ***
## active_2km 3.392e-02 8.637e-04 39.277 < 2e-16 ***
## inactive_2km -1.671e-02 2.482e-03 -6.732 1.67e-11 ***
## elevation -1.697e-02 1.067e-03 -15.901 < 2e-16 ***
## EVI -2.988e+00 4.368e-02 -68.400 < 2e-16 ***
## num_odor_complaints 1.256e-02 2.009e-04 62.519 < 2e-16 ***
## closest_wrp_capacity 1.663e-03 2.927e-04 5.681 1.34e-08 ***
## hourly_downwind_ref -7.028e-02 4.308e-03 -16.314 < 2e-16 ***
## hourly_downwind_wrp 1.035e-01 4.832e-03 21.420 < 2e-16 ***
## hourly_temp -3.352e-02 2.557e-04 -131.116 < 2e-16 ***
## hourly_hum -9.410e-03 8.680e-05 -108.406 < 2e-16 ***
## hourly_precip -7.807e-01 9.974e-02 -7.828 4.99e-15 ***
## disaster 8.926e-01 1.244e-02 71.741 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.967 8 497.7
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.995 9 595.7
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.896 80 1067.3
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 123/126
## R-sq.(adj) = 0.418 Deviance explained = 41.8%
## GCV = 0.54738 Scale est. = 0.5472 n = 367838
Everything w.o Disaster Indicator
# Everything
summary(log_hm_full_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_hourly_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + year + weekday + wd_avg + ws_avg +
## I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km +
## monthly_gas_2km + active_2km + inactive_2km + elevation +
## EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref +
## hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.054e+00 1.333e-01 15.405 < 2e-16 ***
## year2021 2.187e-01 1.197e-02 18.275 < 2e-16 ***
## year2022 1.921e-01 1.581e-02 12.152 < 2e-16 ***
## year2023 3.404e-01 1.682e-02 20.237 < 2e-16 ***
## weekdayMon 1.076e-01 4.640e-03 23.196 < 2e-16 ***
## weekdayTue 1.452e-01 4.603e-03 31.540 < 2e-16 ***
## weekdayWed 1.850e-01 4.597e-03 40.247 < 2e-16 ***
## weekdayThu 1.647e-01 4.596e-03 35.838 < 2e-16 ***
## weekdayFri 1.484e-01 4.591e-03 32.326 < 2e-16 ***
## weekdaySat 6.766e-02 4.589e-03 14.743 < 2e-16 ***
## wd_avg -7.079e-04 1.436e-05 -49.287 < 2e-16 ***
## ws_avg -4.538e-02 4.952e-04 -91.627 < 2e-16 ***
## I(1/dist_wrp^2) 7.347e-07 1.029e-07 7.138 9.47e-13 ***
## I(1/dist_ref^2) -7.509e-05 5.212e-06 -14.406 < 2e-16 ***
## I(1/dist_dc^2) 6.627e-03 2.277e-04 29.104 < 2e-16 ***
## monthly_oil_2km -3.143e-06 1.797e-06 -1.749 0.0803 .
## monthly_gas_2km -2.432e-05 4.037e-06 -6.024 1.70e-09 ***
## active_2km 2.440e-02 8.599e-04 28.379 < 2e-16 ***
## inactive_2km 3.587e-03 2.486e-03 1.443 0.1491
## elevation -2.241e-02 1.072e-03 -20.905 < 2e-16 ***
## EVI -2.888e+00 4.396e-02 -65.692 < 2e-16 ***
## num_odor_complaints 1.417e-02 2.010e-04 70.489 < 2e-16 ***
## closest_wrp_capacity 2.133e-03 2.947e-04 7.235 4.66e-13 ***
## hourly_downwind_ref -7.222e-02 4.338e-03 -16.649 < 2e-16 ***
## hourly_downwind_wrp 1.058e-01 4.865e-03 21.751 < 2e-16 ***
## hourly_temp -3.306e-02 2.574e-04 -128.472 < 2e-16 ***
## hourly_hum -9.386e-03 8.740e-05 -107.398 < 2e-16 ***
## hourly_precip -8.380e-01 1.004e-01 -8.344 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.937 8 601.5
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.994 9 511.8
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.929 80 1056.5
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 122/125
## R-sq.(adj) = 0.41 Deviance explained = 41%
## GCV = 0.55505 Scale est. = 0.55486 n = 367838
adj_r2 <- function(r2, n, p){
# n-p since p here includes intercept
return(1 - (1-r2)*(n - 1)/(n - p))
}
get_bt_adj_r2 <- function(name, response, daterange) {
data <- get_data(response, daterange)
model <- get(paste0(name,'_', daterange, '_gam'))
response_colname <- names(model$model)[1]
response_colname <- str_sub(response_colname, 5, -2)
predictions <- predict(model, newdata = data)
bt_R2 <- R2(data %>% pull(response_colname), exp(predictions))
bt_adj_r2 <- adj_r2(bt_R2, summary(model)$n, summary(model)$np)
return(bt_adj_r2)
}
# comp r2
data <- get_data('log(H2S_daily_avg)', 'dis_ind')
model <- get(paste0('log_da','_', 'dis_ind', '_gam'))
response_colname <- names(model$model)[1]
response_colname <- str_sub(response_colname, 5, -2)
predictions <- predict(model, newdata = data)
bt_R2 <- R2(data %>% pull(response_colname), exp(predictions))
print('R2 computed using R2() function from caret package')
## [1] "R2 computed using R2() function from caret package"
R2(data %>% pull(response_colname) %>% log(), predictions)
## [1] 0.5621578
print('After adjusting for number of predictors')
## [1] "After adjusting for number of predictors"
adj_r2(R2(data %>% pull(response_colname) %>% log(), predictions),
summary(model)$n, summary(model)$np)
## [1] 0.5587909
print('BT-R2 computed using R2() function from caret package')
## [1] "BT-R2 computed using R2() function from caret package"
R2(data %>% pull(response_colname), exp(predictions))
## [1] 0.0004870551
print('After adjusting for number of predictors')
## [1] "After adjusting for number of predictors"
adj_r2(R2(data %>% pull(response_colname), exp(predictions)),
summary(model)$n, summary(model)$np)
## [1] -0.007199022
print('Returned by get_bt_adj_r2')
## [1] "Returned by get_bt_adj_r2"
get_bt_adj_r2('log_da', 'log(H2S_daily_avg)', 'dis_ind')
## [1] -0.007199022
print('Fit obs(y) ~ exp(predicted y)')
## [1] "Fit obs(y) ~ exp(predicted y)"
model_r2 <- lm(data %>% pull(response_colname) ~ exp(predictions))
summary(model_r2)$r.sq
## [1] 0.0004870551
summary(model_r2)$df[1] + summary(model_r2)$df[2]
## [1] 15595
summary(model_r2)$df[1]
## [1] 2
summary(model_r2)$adj.r.squared
## [1] 0.000422955
adj_r2(summary(model_r2)$r.sq,
summary(model_r2)$df[1] + summary(model_r2)$df[2],
summary(model_r2)$df[1])
## [1] 0.000422955
date_names <- c('Since Feb 2022', 'Disaster Only', 'Exclude Disaster',
'Everything w. Disaster Indicator',
'Everything w.o Disaster Indicator')
response_disp_names <- c('Daily Avg', 'Log Daily Avg', 'Daily Max', 'Log Daily Max',
'Hourly Avg', 'Log Hourly Avg', 'Hourly Max', 'Log Hourly Max')
gam_result_table <- expand.grid(date_names, response_disp_names) %>%
setNames(c('date_names', 'response_disp_names'))
date_name_conversion <- tibble(date_names = date_names,
daterange = dateranges)
response_name_conversion <- tibble(response_disp_names = unique(gam_result_table$response_disp_names),
response_obj_name = c(daily_responses, hourly_responses),
model_response_name = response_names,
transformation = rep(c('', 'Log'), 4))
gam_result_table <- gam_result_table %>%
left_join(date_name_conversion) %>%
left_join(response_name_conversion)
## Joining with `by = join_by(date_names)`
## Joining with `by = join_by(response_disp_names)`
gam_result_table <- gam_result_table %>%
mutate(adjr2 = NA,
bt_adjr2 = NA,
n = NA,
p = NA)
for (i in 1:nrow(gam_result_table)) {
name <- gam_result_table$model_response_name[i]
response <- gam_result_table$response_obj_name[i]
daterange <- gam_result_table$daterange[i]
model <- get(paste0(name,'_', daterange, '_gam'))
if (str_detect(response, 'log\\(')) {
bt_adjr2 <- get_bt_adj_r2(name, response, daterange)
} else {
bt_adjr2 <- NA
}
adjr2 <- summary(model)$r.sq
n <- summary(model)$n
p <- summary(model)$np
new_columns <- tibble(adjr2 = adjr2, bt_adjr2 = bt_adjr2, n = n, p = p)
new_row <- bind_cols(gam_result_table[i, 1:6], new_columns)
gam_result_table[i, ] <- new_row
print(str_glue('Completed {i} iterations'))
}
## Completed 1 iterations
## Completed 2 iterations
## Completed 3 iterations
## Completed 4 iterations
## Completed 5 iterations
## Completed 6 iterations
## Completed 7 iterations
## Completed 8 iterations
## Completed 9 iterations
## Completed 10 iterations
## Completed 11 iterations
## Completed 12 iterations
## Completed 13 iterations
## Completed 14 iterations
## Completed 15 iterations
## Completed 16 iterations
## Completed 17 iterations
## Completed 18 iterations
## Completed 19 iterations
## Completed 20 iterations
## Completed 21 iterations
## Completed 22 iterations
## Completed 23 iterations
## Completed 24 iterations
## Completed 25 iterations
## Completed 26 iterations
## Completed 27 iterations
## Completed 28 iterations
## Completed 29 iterations
## Completed 30 iterations
## Completed 31 iterations
## Completed 32 iterations
## Completed 33 iterations
## Completed 34 iterations
## Completed 35 iterations
## Completed 36 iterations
## Completed 37 iterations
## Completed 38 iterations
## Completed 39 iterations
## Completed 40 iterations
temp <- rep(rep(response_disp_names[!str_detect(response_disp_names, 'Log')], each =2), 5)
base_table <- gam_result_table %>%
arrange(factor(date_names, levels = .env$date_names)) %>%
mutate(response_base = temp,
`bt_adjr2` = '') %>%
filter(transformation == '') %>%
select(all_of(c('date_names', 'response_base', 'model_response_name', 'adjr2', 'bt_adjr2', 'n', 'p'))) %>%
select(-bt_adjr2)
log_table <- gam_result_table %>%
arrange(factor(date_names, levels = .env$date_names)) %>%
mutate(response_base = temp) %>%
filter(transformation == 'Log') %>%
select(all_of(c('date_names', 'response_base', 'model_response_name','adjr2', 'bt_adjr2', 'n', 'p')))
gam_result_table_fordisp <- base_table %>%
left_join(log_table, join_by(date_names, response_base)) %>%
select(-'date_names', -starts_with('model_response_name')) %>%
setNames(c('Response', c('Adj.R2', 'N', 'P'), c('Adj.R2', 'BT-Adj.R2', 'N', 'P')))
gam_result_table_kable <- gam_result_table_fordisp %>%
knitr::kable(format = 'latex', digits = 2) %>%
pack_rows(index = c("Since Feb 2022" = 4, "Disaster Only" = 4,
"Exclude Disaster" = 4, "Everything w D.I" = 4, "Everything w.o D.I" = 4)) %>%
add_header_above(c(' ' = 1, 'No Transformation' = 3, 'Log-Transformation' = 4))
writeLines(gam_result_table_kable, '../figures/gam_result_table.tex')
gam_result_table_fordisp %>%
knitr::kable(format = 'html', digits = 2, table.attr = "style='width:100%;'") %>%
pack_rows(index = c("Since Feb 2022" = 4, "Disaster Only" = 4,
"Exclude Disaster" = 4, "Everything w D.I" = 4, "Everything w.o D.I" = 4)) %>%
add_header_above(c(' ' = 1, 'No Transformation' = 3, 'Log-Transformation' = 4))
| Response | Adj.R2 | N | P | Adj.R2 | BT-Adj.R2 | N | P |
|---|---|---|---|---|---|---|---|
| Since Feb 2022 | |||||||
| Daily Avg | 0.62 | 6531 | 117 | 0.70 | 0.61 | 6531 | 116 |
| Daily Max | 0.22 | 6531 | 116 | 0.54 | 0.22 | 6531 | 115 |
| Hourly Avg | 0.35 | 153718 | 123 | 0.55 | 0.34 | 153718 | 122 |
| Hourly Max | 0.17 | 153718 | 122 | 0.50 | 0.16 | 153718 | 122 |
| Disaster Only | |||||||
| Daily Avg | 0.42 | 1273 | 105 | 0.75 | 0.55 | 1273 | 111 |
| Daily Max | 0.43 | 1273 | 99 | 0.72 | 0.44 | 1273 | 110 |
| Hourly Avg | 0.21 | 30242 | 114 | 0.58 | 0.22 | 30242 | 114 |
| Hourly Max | 0.22 | 30242 | 114 | 0.56 | 0.24 | 30242 | 114 |
| Exclude Disaster | |||||||
| Daily Avg | 0.46 | 14322 | 120 | 0.57 | 0.46 | 14322 | 121 |
| Daily Max | 0.12 | 14322 | 115 | 0.48 | 0.11 | 14322 | 120 |
| Hourly Avg | 0.25 | 337596 | 124 | 0.44 | 0.25 | 337596 | 125 |
| Hourly Max | 0.15 | 337596 | 123 | 0.41 | 0.14 | 337596 | 125 |
| Everything w D.I | |||||||
| Daily Avg | 0.11 | 15595 | 115 | 0.56 | -0.01 | 15595 | 120 |
| Daily Max | 0.12 | 15595 | 115 | 0.51 | -0.01 | 15595 | 120 |
| Hourly Avg | 0.05 | 367838 | 126 | 0.44 | 0.00 | 367838 | 125 |
| Hourly Max | 0.05 | 367838 | 125 | 0.42 | 0.00 | 367838 | 126 |
| Everything w.o D.I | |||||||
| Daily Avg | 0.11 | 15595 | 114 | 0.54 | -0.01 | 15595 | 119 |
| Daily Max | 0.12 | 15595 | 115 | 0.49 | -0.01 | 15595 | 120 |
| Hourly Avg | 0.05 | 367838 | 125 | 0.44 | 0.00 | 367838 | 124 |
| Hourly Max | 0.05 | 367838 | 125 | 0.41 | 0.00 | 367838 | 125 |
validation_result <- tibble(Model = character(),
model_response_name = character(),
daterange = character(),
'Coef' = character(),
'R-Sq' = numeric(),
'Disaster RMSE' = numeric(),
'Normal RMSE' = numeric())
xgb_result <- tibble(Model = character(),
model_response_name = character(),
daterange = character(),
'R-Sq' = numeric(),
'BT R-Sq' = numeric(),
'RMSE' = numeric(),
'BT RMSE' = numeric())
fit.xgb_da_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_sincefeb2022.rds')
getTrainPerf(fit.xgb_da_sincefeb2022)
fit.xgb_da_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.6 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "6", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41
## niter: 700
## nfeatures : 41
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 141 700 6 0.1 0.01 0.5 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
names <- c("Longitude" = "mon_utm_x",
"Latitude" = "mon_utm_y",
"Distance to Refinery" = "dist_ref",
"Angle to Refinery" = "angle_ref",
"Active Wells within 2km" = "active_2km",
"Inactive Wells within 2km" = "inactive_2km",
"Monthly Oil Production 2km" = "monthly_oil_2km",
"Monthly Gas Production 2km" = "monthly_gas_2km",
"Distance to WRP" = "dist_wrp",
"WRP Capacity" = "closest_wrp_capacity",
"Angle to WRP" = "angle_wrp",
"Distance to Dominguez Channel" = "dist_dc",
"Average Daily Temperature" = "daily_temp",
"Average Daily Humidity" = "daily_hum",
"Daily Precipitation" = "daily_precip",
"Average Daily Wind Speed" = "ws_avg",
"Average Daily Wind Direction" = "wd_avg",
"Downwind Refinery" = "daily_downwind_ref",
"Downwind WRP" = "daily_downwind_wrp",
"Elevation" = "elevation",
"Enhanced Vegetation Index" = "EVI",
"Number of Daily Odor Complaints" = "num_odor_complaints",
"2020" = "year_2020",
"2021" = "year_2021",
"2022" = "year_2022",
"2023" = "year_2023",
"January" = "month_01",
"February" = "month_02",
"March" = "month_03",
"April" = "month_04",
"May" = "month_05",
"June" = "month_06",
"July" = "month_07",
"August" = "month_08",
"September" = "month_09",
"October" = "month_10",
"November" = "month_11",
"December" = "month_12",
"Monday" = "weekday_Mon",
"Tuesday" = "weekday_Tue",
"Wednesday" = "weekday_Wed",
"Thursday" = "weekday_Thu",
"Friday" = "weekday_Fri",
"Saturday" = "weekday_Sat",
"Sunday" = "weekday_Sun",
"Odor Event" = "disaster")
imp<-varImp(fit.xgb_da_sincefeb2022,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# we use savePredictions = 'final' to store the predictions on the test set at each fold
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_da_sincefeb2022$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_da_sincefeb2022$pred$obs, pred = fit.xgb_da_sincefeb2022$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for Since 2022 XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'da',
daterange = 'sincefeb2022',
'Coef' = summary(lm(fit.xgb_da_sincefeb2022$pred$obs ~
fit.xgb_da_sincefeb2022$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_da_sincefeb2022$pred$obs ~
fit.xgb_da_sincefeb2022$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_da_sincefeb2022$trainingData),
fit.xgb_da_sincefeb2022$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'da',
daterange = 'sincefeb2022',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_da_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_dis.rds')
getTrainPerf(fit.xgb_da_dis)
fit.xgb_da_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 868.8 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "6", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31
## niter: 300
## nfeatures : 31
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 121 300 6 0.1 0.001 0.75 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
fold_stat <- fit.xgb_da_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_disaster_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_da_dis$pred$obs, pred = fit.xgb_da_dis$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
# geom_abline(intercept = 0, slope = 1, color = 'red') +
# geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for Disaster Only XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
xgb_disaster_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = fit.xgb_da_dis$pred$obs, pred = fit.xgb_da_dis$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(0, 50), ylim = c(0, 50)) +
theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result <- rbind(validation_result,
tibble(Model = 'Disaster Only',
model_response_name = 'da',
daterange = 'dis',
'Coef' = summary(lm(fit.xgb_da_dis$pred$obs ~
fit.xgb_da_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_da_dis$pred$obs ~
fit.xgb_da_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_da_dis$trainingData),
fit.xgb_da_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Disaster Only',
model_response_name = 'da',
daterange = 'dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_da_excl_dis<- readRDS('../rfiles/xgboost_v2/fit.xgb_da_excl_dis.rds')
getTrainPerf(fit.xgb_da_excl_dis)
fit.xgb_da_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.7 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "6", gamma = "0.01", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 700
## nfeatures : 43
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 150 700 6 0.1 0.01 0.75 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_excl_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
fold_stat <- fit.xgb_da_excl_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_da_excl_dis$pred$obs,
pred = fit.xgb_da_excl_dis$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for exclude disaster XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
xgb_excl_dis_obs_vs_pred_plot
validation_result <- rbind(validation_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'da',
daterange = 'excl_dis',
'Coef' = summary(lm(fit.xgb_da_excl_dis$pred$obs ~
fit.xgb_da_excl_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_da_excl_dis$pred$obs ~
fit.xgb_da_excl_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_da_excl_dis$trainingData),
fit.xgb_da_excl_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'da',
daterange = 'excl_dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_da_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_dis_ind.rds')
getTrainPerf(fit.xgb_da_dis_ind)
fit.xgb_da_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.1 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "0.5", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44
## niter: 700
## nfeatures : 44
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 21 700 4 0.1 0.001 1 0 0.5
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_dis_ind,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_light()
ggsave('../figures/dis_ind_da_fi.png', width = 20, height = 12, units = "cm")
test_result_data <- tibble(obs = fit.xgb_da_dis_ind$pred$obs,
pred = fit.xgb_da_dis_ind$pred$pred,
disaster = fit.xgb_da_dis_ind$trainingData$disaster[fit.xgb_da_dis_ind$pred$rowIndex])
fold_stat <- fit.xgb_da_dis_ind$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
da_dis_ind_obs_vs_pred_plot <-
ggplot(tibble(obs = test_result_data$obs,
pred = test_result_data$pred,
disaster = test_result_data$disaster),
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
labs(y = 'Observed', x = 'Predicted', title = '') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "left", label.y = 0.95) +
stat_poly_eq(use_label(c("R2")), label.x = "left", label.y = 0.91) +
stat_poly_eq(use_label(c("n")), label.x = "left", label.y = 0.85) +
theme_bw()
da_dis_ind_obs_vs_pred_plot_zoom <-
ggplot(tibble(obs = test_result_data$obs,
pred = test_result_data$pred,
disaster = test_result_data$disaster),
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
labs(y = 'Observed', x = 'Predicted', title = '') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "left", label.y = 0.95) +
stat_poly_eq(use_label(c("R2")), label.x = "left", label.y = 0.91) +
stat_poly_eq(use_label(c("n")), label.x = "left", label.y = 0.85) +
coord_cartesian(xlim = c(0, 10), ylim = c(0, 10)) +
theme_bw()
ggarrange(da_dis_ind_obs_vs_pred_plot, da_dis_ind_obs_vs_pred_plot_zoom,
labels = c("1", "2"), nrow = 1)
ggsave('../figures/dis_ind_da_ovp.png', width = 22, height = 12, units = "cm")
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'da',
daterange = 'dis_ind',
'Coef' = summary(lm(fit.xgb_da_dis_ind$pred$obs ~
fit.xgb_da_dis_ind$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_da_dis_ind$pred$obs ~
fit.xgb_da_dis_ind$pred$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_da_dis_ind$trainingData),
fit.xgb_da_dis_ind$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'da',
daterange = 'dis_ind',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_da_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_full.rds')
getTrainPerf(fit.xgb_da_full)
fit.xgb_da_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.6 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.3", max_depth = "5", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 700
## nfeatures : 43
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 243 700 5 0.3 0.001 1 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_full,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_da_full$pred$obs,
pred = fit.xgb_da_full$pred$pred,
disaster = if_else(fit.xgb_da_full$trainingData[fit.xgb_da_full$pred$rowIndex, ]$year_2021 == 1 &
(fit.xgb_da_full$trainingData[fit.xgb_da_full$pred$rowIndex, ]$month_10 == 1 |
fit.xgb_da_full$trainingData[fit.xgb_da_full$pred$rowIndex, ]$month_11 == 1 |
fit.xgb_da_full$trainingData[fit.xgb_da_full$pred$rowIndex, ]$month_12 == 1), 1, 0))
fold_stat <- fit.xgb_da_full$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_everything_obs_vs_pred_plot <- ggplot(test_result_data,
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for everything XGBoost') +
theme_bw()
xgb_everything_obs_vs_pred_plot_zoom <- ggplot(test_result_data,
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'da',
daterange = 'full',
'Coef' = summary(lm(test_result_data$obs ~
test_result_data$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(test_result_data$obs ~
test_result_data$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_da_full$trainingData),
fit.xgb_da_full$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'da',
daterange = 'full',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
knitr::kable(validation_result, digits = 3)
| Model | model_response_name | daterange | Coef | R-Sq | Disaster RMSE | Normal RMSE |
|---|---|---|---|---|---|---|
| Since Feb 2022 | da | sincefeb2022 | 1.012 | 0.759 | NA | NA |
| Disaster Only | da | dis | 1.172 | 0.642 | NA | NA |
| Exclude Disaster | da | excl_dis | 1.004 | 0.673 | NA | NA |
| Everything w. Disaster Indicator | da | dis_ind | 1.063 | 0.970 | 15.828 | 0.477 |
| Everything w.o Disaster Indicator | da | full | 1.049 | 0.989 | 9.919 | 0.137 |
fit.xgb_da_log_h2s_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_log_h2s_sincefeb2022.rds')
getTrainPerf(fit.xgb_da_log_h2s_sincefeb2022)
fit.xgb_da_log_h2s_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.7 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "6", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41
## niter: 700
## nfeatures : 41
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 123 700 6 0.1 0.001 0.75 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_log_h2s_sincefeb2022,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_da_log_h2s_sincefeb2022$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_da_log_h2s_sincefeb2022$pred$obs),
pred = exp(fit.xgb_da_log_h2s_sincefeb2022$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Since Februrary 2022') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.05) + stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) + stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.15) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'log_da',
daterange = 'sincefeb2022',
'Coef' = summary(lm(fit.xgb_da_log_h2s_sincefeb2022$pred$obs ~
fit.xgb_da_log_h2s_sincefeb2022$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_da_log_h2s_sincefeb2022$pred$obs ~
fit.xgb_da_log_h2s_sincefeb2022$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_da_log_h2s_sincefeb2022$trainingData),
fit.xgb_da_log_h2s_sincefeb2022$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_da_log_h2s_sincefeb2022$trainingData),
fit.xgb_da_log_h2s_sincefeb2022$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'log_da',
daterange = 'sincefeb2022',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_da_log_h2s_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_log_h2s_dis.rds')
getTrainPerf(fit.xgb_da_log_h2s_dis)
fit.xgb_da_log_h2s_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.6 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "5", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31
## niter: 700
## nfeatures : 31
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 69 700 5 0.1 0.001 0.75 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_log_h2s_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_da_log_h2s_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_da_log_h2s_dis$pred$obs),
pred = exp(fit.xgb_da_log_h2s_dis$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Disaster Only') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) + stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) + stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.18) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Disaster Only',
model_response_name = 'log_da',
daterange = 'dis',
'Coef' = summary(lm(fit.xgb_da_log_h2s_dis$pred$obs ~
fit.xgb_da_log_h2s_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_da_log_h2s_dis$pred$obs ~
fit.xgb_da_log_h2s_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_da_log_h2s_dis$trainingData),
fit.xgb_da_log_h2s_dis$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_da_log_h2s_dis$trainingData),
fit.xgb_da_log_h2s_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Disaster Only',
model_response_name = 'log_da',
daterange = 'dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_da_log_h2s_excl_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_log_h2s_excl_dis.rds')
getTrainPerf(fit.xgb_da_log_h2s_excl_dis)
fit.xgb_da_log_h2s_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.7 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "6", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 700
## nfeatures : 43
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 123 700 6 0.1 0.001 0.75 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_log_h2s_excl_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_da_log_h2s_excl_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_da_log_h2s_excl_dis$pred$obs),
pred = exp(fit.xgb_da_log_h2s_excl_dis$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Exclude Disaster') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) + stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) + stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'log_da',
daterange = 'excl_dis',
'Coef' = summary(lm(fit.xgb_da_log_h2s_excl_dis$pred$obs ~
fit.xgb_da_log_h2s_excl_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_da_log_h2s_excl_dis$pred$obs ~
fit.xgb_da_log_h2s_excl_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_da_log_h2s_excl_dis$trainingData),
fit.xgb_da_log_h2s_excl_dis$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_da_log_h2s_excl_dis$trainingData),
fit.xgb_da_log_h2s_excl_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'log_da',
daterange = 'excl_dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_da_log_h2s_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_log_h2s_dis_ind.rds')
getTrainPerf(fit.xgb_da_log_h2s_dis_ind)
fit.xgb_da_log_h2s_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.8 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.3", max_depth = "6", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44
## niter: 700
## nfeatures : 44
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 285 700 6 0.3 0.001 0.75 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_log_h2s_dis_ind,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_da_log_h2s_dis_ind$pred$obs,
pred = fit.xgb_da_log_h2s_dis_ind$pred$pred,
disaster = fit.xgb_da_log_h2s_dis_ind$trainingData$disaster[fit.xgb_da_log_h2s_dis_ind$pred$rowIndex])
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_da_log_h2s_dis_ind$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_ind_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_da_log_h2s_dis_ind$pred$obs),
pred = exp(fit.xgb_da_log_h2s_dis_ind$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Everything w. Disaster Indicator') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) +
stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.08) +
stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.17) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'log_da',
daterange = 'dis_ind',
'Coef' = summary(lm(fit.xgb_da_log_h2s_dis_ind$pred$obs ~
fit.xgb_da_log_h2s_dis_ind$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_da_log_h2s_dis_ind$pred$obs ~
fit.xgb_da_log_h2s_dis_ind$pred$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_da_log_h2s_dis_ind$trainingData),
fit.xgb_da_log_h2s_dis_ind$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_da_log_h2s_dis_ind$trainingData),
fit.xgb_da_log_h2s_dis_ind$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'log_da',
daterange = 'dis_ind',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_da_log_h2s_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_log_h2s_full.rds')
getTrainPerf(fit.xgb_da_log_h2s_full)
fit.xgb_da_log_h2s_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.8 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.3", max_depth = "6", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 700
## nfeatures : 43
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 285 700 6 0.3 0.001 0.75 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_log_h2s_full,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_da_log_h2s_full$pred$obs,
pred = fit.xgb_da_log_h2s_full$pred$pred,
disaster = if_else(fit.xgb_da_log_h2s_full$trainingData[fit.xgb_da_log_h2s_full$pred$rowIndex, ]$year_2021 == 1 &
(fit.xgb_da_log_h2s_full$trainingData[fit.xgb_da_log_h2s_full$pred$rowIndex, ]$month_10 == 1 |
fit.xgb_da_log_h2s_full$trainingData[fit.xgb_da_log_h2s_full$pred$rowIndex, ]$month_11 == 1 |
fit.xgb_da_log_h2s_full$trainingData[fit.xgb_da_log_h2s_full$pred$rowIndex, ]$month_12 == 1), 1, 0))
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_da_log_h2s_full$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_full_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_da_log_h2s_full$pred$obs),
pred = exp(fit.xgb_da_log_h2s_full$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Everything w.o Disaster Indicator') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) +
stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +
stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'log_da',
daterange = 'full',
'Coef' = summary(lm(fit.xgb_da_log_h2s_full$pred$obs ~
fit.xgb_da_log_h2s_full$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_da_log_h2s_full$pred$obs ~
fit.xgb_da_log_h2s_full$pred$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_da_log_h2s_full$trainingData),
fit.xgb_da_log_h2s_full$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_da_log_h2s_full$trainingData),
fit.xgb_da_log_h2s_full$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'log_da',
daterange = 'full',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_dm_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_sincefeb2022.rds')
getTrainPerf(fit.xgb_dm_sincefeb2022)
fit.xgb_dm_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "6", gamma = "0.01", colsample_bytree = "0.75", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41
## niter: 300
## nfeatures : 41
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 151 300 6 0.1 0.01 0.75 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_sincefeb2022,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# we use savePredictions = 'final' to store the predictions on the test set at each fold
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_dm_sincefeb2022$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_dm_sincefeb2022$pred$obs, pred = fit.xgb_dm_sincefeb2022$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for Since 2022 XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'dm',
daterange = 'sincefeb2022',
'Coef' = summary(lm(fit.xgb_dm_sincefeb2022$pred$obs ~
fit.xgb_dm_sincefeb2022$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_dm_sincefeb2022$pred$obs ~
fit.xgb_dm_sincefeb2022$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_dm_sincefeb2022$trainingData),
fit.xgb_dm_sincefeb2022$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'dm',
daterange = 'sincefeb2022',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_dm_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_dis.rds')
getTrainPerf(fit.xgb_dm_dis)
fit.xgb_dm_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 459.6 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "4", gamma = "0.01", colsample_bytree = "0.75", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31
## niter: 300
## nfeatures : 31
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 43 300 4 0.1 0.01 0.75 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
fold_stat <- fit.xgb_dm_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_disaster_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_dm_dis$pred$obs, pred = fit.xgb_dm_dis$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
# geom_abline(intercept = 0, slope = 1, color = 'red') +
# geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for Disaster Only XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
xgb_disaster_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = fit.xgb_dm_dis$pred$obs, pred = fit.xgb_dm_dis$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(0, 50), ylim = c(0, 50)) +
theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result <- rbind(validation_result,
tibble(Model = 'Disaster Only',
model_response_name = 'dm',
daterange = 'dis',
'Coef' = summary(lm(fit.xgb_dm_dis$pred$obs ~
fit.xgb_dm_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_dm_dis$pred$obs ~
fit.xgb_dm_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_dm_dis$trainingData),
fit.xgb_dm_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Disaster Only',
model_response_name = 'dm',
daterange = 'dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_dm_excl_dis<- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_excl_dis.rds')
getTrainPerf(fit.xgb_dm_excl_dis)
fit.xgb_dm_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.1 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "6", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 300
## nfeatures : 43
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 142 300 6 0.1 0.01 0.5 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_excl_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
fold_stat <- fit.xgb_dm_excl_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_dm_excl_dis$pred$obs,
pred = fit.xgb_dm_excl_dis$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for exclude disaster XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
xgb_excl_dis_obs_vs_pred_plot
validation_result <- rbind(validation_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'dm',
daterange = 'excl_dis',
'Coef' = summary(lm(fit.xgb_dm_excl_dis$pred$obs ~
fit.xgb_dm_excl_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_dm_excl_dis$pred$obs ~
fit.xgb_dm_excl_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_dm_excl_dis$trainingData),
fit.xgb_dm_excl_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'dm',
daterange = 'excl_dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_dm_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_dis_ind.rds')
getTrainPerf(fit.xgb_dm_dis_ind)
fit.xgb_dm_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.6 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.3", max_depth = "6", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44
## niter: 700
## nfeatures : 44
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 294 700 6 0.3 0.001 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_dis_ind,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_light()
ggsave('../figures/dis_ind_dm_fi.png', width = 20, height = 12, units = "cm")
test_result_data <- tibble(obs = fit.xgb_dm_dis_ind$pred$obs,
pred = fit.xgb_dm_dis_ind$pred$pred,
disaster = fit.xgb_dm_dis_ind$trainingData$disaster[fit.xgb_dm_dis_ind$pred$rowIndex])
fold_stat <- fit.xgb_dm_dis_ind$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
dm_dis_ind_obs_vs_pred_plot <-
ggplot(tibble(obs = test_result_data$obs,
pred = test_result_data$pred,
disaster = test_result_data$disaster),
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
labs(y = 'Observed', x = 'Predicted', title = '') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "left", label.y = 0.95) +
stat_poly_eq(use_label(c("R2")), label.x = "left", label.y = 0.91) +
stat_poly_eq(use_label(c("n")), label.x = "left", label.y = 0.85) +
theme_bw()
dm_dis_ind_obs_vs_pred_plot_zoom <-
ggplot(tibble(obs = test_result_data$obs,
pred = test_result_data$pred,
disaster = test_result_data$disaster),
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
labs(y = 'Observed', x = 'Predicted', title = '') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "left", label.y = 0.95) +
stat_poly_eq(use_label(c("R2")), label.x = "left", label.y = 0.91) +
stat_poly_eq(use_label(c("n")), label.x = "left", label.y = 0.85) +
coord_cartesian(xlim = c(0, 50), ylim = c(0, 50)) +
theme_bw()
ggarrange(dm_dis_ind_obs_vs_pred_plot, dm_dis_ind_obs_vs_pred_plot_zoom,
labels = c("1", "2"), nrow = 1)
ggsave('../figures/dis_ind_dm_ovp.png', width = 22, height = 12, units = "cm")
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'dm',
daterange = 'dis_ind',
'Coef' = summary(lm(fit.xgb_dm_dis_ind$pred$obs ~
fit.xgb_dm_dis_ind$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_dm_dis_ind$pred$obs ~
fit.xgb_dm_dis_ind$pred$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_dm_dis_ind$trainingData),
fit.xgb_dm_dis_ind$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'dm',
daterange = 'dis_ind',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_dm_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_full.rds')
getTrainPerf(fit.xgb_dm_full)
fit.xgb_dm_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.6 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.3", max_depth = "5", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 700
## nfeatures : 43
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 231 700 5 0.3 0.001 0.75 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_full,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_dm_full$pred$obs,
pred = fit.xgb_dm_full$pred$pred,
disaster = if_else(fit.xgb_dm_full$trainingData[fit.xgb_dm_full$pred$rowIndex, ]$year_2021 == 1 &
(fit.xgb_dm_full$trainingData[fit.xgb_dm_full$pred$rowIndex, ]$month_10 == 1 |
fit.xgb_dm_full$trainingData[fit.xgb_dm_full$pred$rowIndex, ]$month_11 == 1 |
fit.xgb_dm_full$trainingData[fit.xgb_dm_full$pred$rowIndex, ]$month_12 == 1), 1, 0))
fold_stat <- fit.xgb_dm_full$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_everything_obs_vs_pred_plot <- ggplot(test_result_data,
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for everything XGBoost') +
theme_bw()
xgb_everything_obs_vs_pred_plot_zoom <- ggplot(test_result_data,
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'dm',
daterange = 'full',
'Coef' = summary(lm(test_result_data$obs ~
test_result_data$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(test_result_data$obs ~
test_result_data$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
train_adj_r2 <- adj_r2(getTrainPerf(fit.xgb_dm_full)$TrainRsquared,
nrow(fit.xgb_dm_full$trainingData),
fit.xgb_dm_full$finalModel$nfeatures)
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_dm_full$trainingData),
fit.xgb_dm_full$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'dm',
daterange = 'full',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_dm_log_h2s_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_log_h2s_sincefeb2022.rds')
getTrainPerf(fit.xgb_dm_log_h2s_sincefeb2022)
fit.xgb_dm_log_h2s_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "6", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41
## niter: 300
## nfeatures : 41
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 133 300 6 0.1 0.001 1 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_log_h2s_sincefeb2022,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_dm_log_h2s_sincefeb2022$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_dm_log_h2s_sincefeb2022$pred$obs),
pred = exp(fit.xgb_dm_log_h2s_sincefeb2022$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Since Februrary 2022') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.05) + stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) + stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.15) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'log_dm',
daterange = 'sincefeb2022',
'Coef' = summary(lm(fit.xgb_dm_log_h2s_sincefeb2022$pred$obs ~
fit.xgb_dm_log_h2s_sincefeb2022$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_dm_log_h2s_sincefeb2022$pred$obs ~
fit.xgb_dm_log_h2s_sincefeb2022$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_dm_log_h2s_sincefeb2022$trainingData),
fit.xgb_dm_log_h2s_sincefeb2022$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_dm_log_h2s_sincefeb2022$trainingData),
fit.xgb_dm_log_h2s_sincefeb2022$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'log_dm',
daterange = 'sincefeb2022',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_dm_log_h2s_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_log_h2s_dis.rds')
getTrainPerf(fit.xgb_dm_log_h2s_dis)
fit.xgb_dm_log_h2s_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 793.6 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "4", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31
## niter: 500
## nfeatures : 31
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 32 500 4 0.1 0.01 0.5 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_log_h2s_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_dm_log_h2s_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_dm_log_h2s_dis$pred$obs),
pred = exp(fit.xgb_dm_log_h2s_dis$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Disaster Only') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) + stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) + stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.18) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Disaster Only',
model_response_name = 'log_dm',
daterange = 'dis',
'Coef' = summary(lm(fit.xgb_dm_log_h2s_dis$pred$obs ~
fit.xgb_dm_log_h2s_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_dm_log_h2s_dis$pred$obs ~
fit.xgb_dm_log_h2s_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_dm_log_h2s_dis$trainingData),
fit.xgb_dm_log_h2s_dis$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_dm_log_h2s_dis$trainingData),
fit.xgb_dm_log_h2s_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Disaster Only',
model_response_name = 'log_dm',
daterange = 'dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_dm_log_h2s_excl_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_log_h2s_excl_dis.rds')
getTrainPerf(fit.xgb_dm_log_h2s_excl_dis)
fit.xgb_dm_log_h2s_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "6", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 500
## nfeatures : 43
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 158 500 6 0.1 0.01 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_log_h2s_excl_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_dm_log_h2s_excl_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_dm_log_h2s_excl_dis$pred$obs),
pred = exp(fit.xgb_dm_log_h2s_excl_dis$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Exclude Disaster') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) + stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) + stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'log_dm',
daterange = 'excl_dis',
'Coef' = summary(lm(fit.xgb_dm_log_h2s_excl_dis$pred$obs ~
fit.xgb_dm_log_h2s_excl_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_dm_log_h2s_excl_dis$pred$obs ~
fit.xgb_dm_log_h2s_excl_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_dm_log_h2s_excl_dis$trainingData),
fit.xgb_dm_log_h2s_excl_dis$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_dm_log_h2s_excl_dis$trainingData),
fit.xgb_dm_log_h2s_excl_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'log_dm',
daterange = 'excl_dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_dm_log_h2s_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_log_h2s_dis_ind.rds')
getTrainPerf(fit.xgb_dm_log_h2s_dis_ind)
fit.xgb_dm_log_h2s_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.8 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.3", max_depth = "6", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44
## niter: 700
## nfeatures : 44
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 321 700 6 0.3 0.01 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_log_h2s_dis_ind,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_dm_log_h2s_dis_ind$pred$obs,
pred = fit.xgb_dm_log_h2s_dis_ind$pred$pred,
disaster = fit.xgb_dm_log_h2s_dis_ind$trainingData$disaster[fit.xgb_dm_log_h2s_dis_ind$pred$rowIndex])
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_dm_log_h2s_dis_ind$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_ind_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_dm_log_h2s_dis_ind$pred$obs),
pred = exp(fit.xgb_dm_log_h2s_dis_ind$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Everything w. Disaster Indicator') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) +
stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.08) +
stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.17) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'log_dm',
daterange = 'dis_ind',
'Coef' = summary(lm(fit.xgb_dm_log_h2s_dis_ind$pred$obs ~
fit.xgb_dm_log_h2s_dis_ind$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_dm_log_h2s_dis_ind$pred$obs ~
fit.xgb_dm_log_h2s_dis_ind$pred$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_dm_log_h2s_dis_ind$trainingData),
fit.xgb_dm_log_h2s_dis_ind$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_dm_log_h2s_dis_ind$trainingData),
fit.xgb_dm_log_h2s_dis_ind$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'log_dm',
daterange = 'dis_ind',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_dm_log_h2s_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_log_h2s_full.rds')
getTrainPerf(fit.xgb_dm_log_h2s_full)
fit.xgb_dm_log_h2s_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.8 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.3", max_depth = "6", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 700
## nfeatures : 43
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 321 700 6 0.3 0.01 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_log_h2s_full,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_dm_log_h2s_full$pred$obs,
pred = fit.xgb_dm_log_h2s_full$pred$pred,
disaster = if_else(fit.xgb_dm_log_h2s_full$trainingData[fit.xgb_dm_log_h2s_full$pred$rowIndex, ]$year_2021 == 1 &
(fit.xgb_dm_log_h2s_full$trainingData[fit.xgb_dm_log_h2s_full$pred$rowIndex, ]$month_10 == 1 |
fit.xgb_dm_log_h2s_full$trainingData[fit.xgb_dm_log_h2s_full$pred$rowIndex, ]$month_11 == 1 |
fit.xgb_dm_log_h2s_full$trainingData[fit.xgb_dm_log_h2s_full$pred$rowIndex, ]$month_12 == 1), 1, 0))
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_dm_log_h2s_full$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_full_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_dm_log_h2s_full$pred$obs),
pred = exp(fit.xgb_dm_log_h2s_full$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Everything w.o Disaster Indicator') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) +
stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +
stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'log_dm',
daterange = 'full',
'Coef' = summary(lm(fit.xgb_dm_log_h2s_full$pred$obs ~
fit.xgb_dm_log_h2s_full$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_dm_log_h2s_full$pred$obs ~
fit.xgb_dm_log_h2s_full$pred$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_dm_log_h2s_full$trainingData),
fit.xgb_dm_log_h2s_full$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_dm_log_h2s_full$trainingData),
fit.xgb_dm_log_h2s_full$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'log_dm',
daterange = 'full',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_ha_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_ha_sincefeb2022.rds')
getTrainPerf(fit.xgb_ha_sincefeb2022)
fit.xgb_ha_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 251.7 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41
## niter: 150
## nfeatures : 41
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 54 150 4 0.4 0.001 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
names <- c("Longitude" = "mon_utm_x",
"Latitude" = "mon_utm_y",
"Distance to Refinery" = "dist_ref",
"Angle to Refinery" = "angle_ref",
"Active Wells within 2km" = "active_2km",
"Inactive Wells within 2km" = "inactive_2km",
"Monthly Oil Production 2km" = "monthly_oil_2km",
"Monthly Gas Production 2km" = "monthly_gas_2km",
"Distance to WRP" = "dist_wrp",
"WRP Capacity" = "closest_wrp_capacity",
"Angle to WRP" = "angle_wrp",
"Distance to Dominguez Channel" = "dist_dc",
"Hourly Temperature" = "hourly_temp",
"Hourly Humidity" = "hourly_hum",
"Hourly Precipitation" = "hourly_precip",
"Hourly Wind Speed" = "ws_avg",
"Hourly Wind Direction" = "wd_avg",
"Downwind Refinery" = "hourly_downwind_ref",
"Downwind WRP" = "hourly_downwind_wrp",
"Elevation" = "elevation",
"Enhanced Vegetation Index" = "EVI",
"Number of Daily Odor Complaints" = "num_odor_complaints",
"2020" = "year_2020",
"2021" = "year_2021",
"2022" = "year_2022",
"2023" = "year_2023",
"January" = "month_01",
"February" = "month_02",
"March" = "month_03",
"April" = "month_04",
"May" = "month_05",
"June" = "month_06",
"July" = "month_07",
"August" = "month_08",
"September" = "month_09",
"October" = "month_10",
"November" = "month_11",
"December" = "month_12",
"Monday" = "weekday_Mon",
"Tuesday" = "weekday_Tue",
"Wednesday" = "weekday_Wed",
"Thursday" = "weekday_Thu",
"Friday" = "weekday_Fri",
"Saturday" = "weekday_Sat",
"Sunday" = "weekday_Sun",
"Odor Event" = "disaster")
imp<-varImp(fit.xgb_ha_sincefeb2022,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# we use savePredictions = 'final' to store the predictions on the test set at each fold
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_ha_sincefeb2022$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_ha_sincefeb2022$pred$obs, pred = fit.xgb_ha_sincefeb2022$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for Since 2022 XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'ha',
daterange = 'sincefeb2022',
'Coef' = summary(lm(fit.xgb_ha_sincefeb2022$pred$obs ~
fit.xgb_ha_sincefeb2022$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_ha_sincefeb2022$pred$obs ~
fit.xgb_ha_sincefeb2022$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_ha_sincefeb2022$trainingData),
fit.xgb_ha_sincefeb2022$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'ha',
daterange = 'sincefeb2022',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_ha_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_ha_dis.rds')
getTrainPerf(fit.xgb_ha_dis)
fit.xgb_ha_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 84.9 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "0.8", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31
## niter: 50
## nfeatures : 31
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 51 50 4 0.4 0.001 0.8 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_ha_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
fold_stat <- fit.xgb_ha_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_disaster_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_ha_dis$pred$obs, pred = fit.xgb_ha_dis$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
# geom_abline(intercept = 0, slope = 1, color = 'red') +
# geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for Disaster Only XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
xgb_disaster_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = fit.xgb_ha_dis$pred$obs, pred = fit.xgb_ha_dis$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(0, 50), ylim = c(0, 50)) +
theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result <- rbind(validation_result,
tibble(Model = 'Disaster Only',
model_response_name = 'ha',
daterange = 'dis',
'Coef' = summary(lm(fit.xgb_ha_dis$pred$obs ~
fit.xgb_ha_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_ha_dis$pred$obs ~
fit.xgb_ha_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_ha_dis$trainingData),
fit.xgb_ha_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Disaster Only',
model_response_name = 'ha',
daterange = 'dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_ha_excl_dis<- readRDS('../rfiles/xgboost_v2/fit.xgb_ha_excl_dis.rds')
getTrainPerf(fit.xgb_ha_excl_dis)
fit.xgb_ha_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 253.1 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "0.8", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 150
## nfeatures : 43
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 52 150 4 0.4 0.001 0.8 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_ha_excl_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
fold_stat <- fit.xgb_ha_excl_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_ha_excl_dis$pred$obs,
pred = fit.xgb_ha_excl_dis$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for exclude disaster XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
xgb_excl_dis_obs_vs_pred_plot
validation_result <- rbind(validation_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'ha',
daterange = 'excl_dis',
'Coef' = summary(lm(fit.xgb_ha_excl_dis$pred$obs ~
fit.xgb_ha_excl_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_ha_excl_dis$pred$obs ~
fit.xgb_ha_excl_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_ha_excl_dis$trainingData),
fit.xgb_ha_excl_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'ha',
daterange = 'excl_dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_ha_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_ha_dis_ind.rds')
getTrainPerf(fit.xgb_ha_dis_ind)
fit.xgb_ha_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 249.3 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44
## niter: 150
## nfeatures : 44
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 62 150 4 0.4 0.01 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_ha_dis_ind,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_light()
ggsave('../figures/dis_ind_ha_fi.png', width = 20, height = 12, units = "cm")
test_result_data <- tibble(obs = fit.xgb_ha_dis_ind$pred$obs,
pred = fit.xgb_ha_dis_ind$pred$pred,
disaster = fit.xgb_ha_dis_ind$trainingData$disaster[fit.xgb_ha_dis_ind$pred$rowIndex])
fold_stat <- fit.xgb_ha_dis_ind$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
ha_dis_ind_obs_vs_pred_plot <-
ggplot(tibble(obs = test_result_data$obs,
pred = test_result_data$pred,
disaster = test_result_data$disaster),
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
labs(y = 'Observed', x = 'Predicted', title = '') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "left", label.y = 0.95) +
stat_poly_eq(use_label(c("R2")), label.x = "left", label.y = 0.91) +
stat_poly_eq(use_label(c("n")), label.x = "left", label.y = 0.85) +
theme_bw()
ha_dis_ind_obs_vs_pred_plot_zoom <-
ggplot(tibble(obs = test_result_data$obs,
pred = test_result_data$pred,
disaster = test_result_data$disaster),
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
labs(y = 'Observed', x = 'Predicted', title = '') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "left", label.y = 0.95) +
stat_poly_eq(use_label(c("R2")), label.x = "left", label.y = 0.91) +
stat_poly_eq(use_label(c("n")), label.x = "left", label.y = 0.85) +
coord_cartesian(xlim = c(0, 20), ylim = c(0, 20)) +
theme_bw()
ggarrange(ha_dis_ind_obs_vs_pred_plot, ha_dis_ind_obs_vs_pred_plot_zoom,
labels = c("1", "2"), nrow = 1)
ggsave('../figures/dis_ind_ha_ovp.png', width = 22, height = 12, units = "cm")
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'ha',
daterange = 'dis_ind',
'Coef' = summary(lm(fit.xgb_ha_dis_ind$pred$obs ~
fit.xgb_ha_dis_ind$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_ha_dis_ind$pred$obs ~
fit.xgb_ha_dis_ind$pred$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_ha_dis_ind$trainingData),
fit.xgb_ha_dis_ind$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'ha',
daterange = 'dis_ind',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_ha_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_ha_full.rds')
getTrainPerf(fit.xgb_ha_full)
fit.xgb_ha_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 249 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 150
## nfeatures : 43
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 54 150 4 0.4 0.001 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_ha_full,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_ha_full$pred$obs,
pred = fit.xgb_ha_full$pred$pred,
disaster = if_else(fit.xgb_ha_full$trainingData[fit.xgb_ha_full$pred$rowIndex, ]$year_2021 == 1 &
(fit.xgb_ha_full$trainingData[fit.xgb_ha_full$pred$rowIndex, ]$month_10 == 1 |
fit.xgb_ha_full$trainingData[fit.xgb_ha_full$pred$rowIndex, ]$month_11 == 1 |
fit.xgb_ha_full$trainingData[fit.xgb_ha_full$pred$rowIndex, ]$month_12 == 1), 1, 0))
fold_stat <- fit.xgb_ha_full$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_everything_obs_vs_pred_plot <- ggplot(test_result_data,
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for everything XGBoost') +
theme_bw()
xgb_everything_obs_vs_pred_plot_zoom <- ggplot(test_result_data,
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'ha',
daterange = 'full',
'Coef' = summary(lm(test_result_data$obs ~
test_result_data$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(test_result_data$obs ~
test_result_data$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_ha_full$trainingData),
fit.xgb_ha_full$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'ha',
daterange = 'full',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_log_ha_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_ha_sincefeb2022.rds')
getTrainPerf(fit.xgb_log_ha_sincefeb2022)
fit.xgb_log_ha_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 252.8 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41
## niter: 150
## nfeatures : 41
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 56 150 4 0.4 0.001 1 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_ha_sincefeb2022,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_ha_sincefeb2022$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_ha_sincefeb2022$pred$obs),
pred = exp(fit.xgb_log_ha_sincefeb2022$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Since Februrary 2022') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.05) + stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) + stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.15) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'log_ha',
daterange = 'sincefeb2022',
'Coef' = summary(lm(fit.xgb_log_ha_sincefeb2022$pred$obs ~
fit.xgb_log_ha_sincefeb2022$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_log_ha_sincefeb2022$pred$obs ~
fit.xgb_log_ha_sincefeb2022$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_log_ha_sincefeb2022$trainingData),
fit.xgb_log_ha_sincefeb2022$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_log_ha_sincefeb2022$trainingData),
fit.xgb_log_ha_sincefeb2022$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'log_ha',
daterange = 'sincefeb2022',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_log_ha_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_ha_dis.rds')
getTrainPerf(fit.xgb_log_ha_dis)
fit.xgb_log_ha_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 250.3 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31
## niter: 150
## nfeatures : 31
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 56 150 4 0.4 0.001 1 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_ha_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_ha_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_ha_dis$pred$obs),
pred = exp(fit.xgb_log_ha_dis$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Disaster Only') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) + stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) + stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.18) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Disaster Only',
model_response_name = 'log_ha',
daterange = 'dis',
'Coef' = summary(lm(fit.xgb_log_ha_dis$pred$obs ~
fit.xgb_log_ha_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_log_ha_dis$pred$obs ~
fit.xgb_log_ha_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_log_ha_dis$trainingData),
fit.xgb_log_ha_dis$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_log_ha_dis$trainingData),
fit.xgb_log_ha_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Disaster Only',
model_response_name = 'log_ha',
daterange = 'dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_log_ha_excl_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_ha_excl_dis.rds')
getTrainPerf(fit.xgb_log_ha_excl_dis)
fit.xgb_log_ha_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 253.9 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 150
## nfeatures : 43
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 64 150 4 0.4 0.01 1 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_ha_excl_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_ha_excl_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_ha_excl_dis$pred$obs),
pred = exp(fit.xgb_log_ha_excl_dis$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Exclude Disaster') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) + stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) + stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'log_ha',
daterange = 'excl_dis',
'Coef' = summary(lm(fit.xgb_log_ha_excl_dis$pred$obs ~
fit.xgb_log_ha_excl_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_log_ha_excl_dis$pred$obs ~
fit.xgb_log_ha_excl_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_log_ha_excl_dis$trainingData),
fit.xgb_log_ha_excl_dis$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_log_ha_excl_dis$trainingData),
fit.xgb_log_ha_excl_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'log_ha',
daterange = 'excl_dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_log_ha_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_ha_dis_ind.rds')
getTrainPerf(fit.xgb_log_ha_dis_ind)
fit.xgb_log_ha_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 254 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44
## niter: 150
## nfeatures : 44
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 64 150 4 0.4 0.01 1 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_ha_dis_ind,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_log_ha_dis_ind$pred$obs,
pred = fit.xgb_log_ha_dis_ind$pred$pred,
disaster = fit.xgb_log_ha_dis_ind$trainingData$disaster[fit.xgb_log_ha_dis_ind$pred$rowIndex])
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_ha_dis_ind$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_ind_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_ha_dis_ind$pred$obs),
pred = exp(fit.xgb_log_ha_dis_ind$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Everything w. Disaster Indicator') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) +
stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.08) +
stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.17) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'log_ha',
daterange = 'dis_ind',
'Coef' = summary(lm(fit.xgb_log_ha_dis_ind$pred$obs ~
fit.xgb_log_ha_dis_ind$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_log_ha_dis_ind$pred$obs ~
fit.xgb_log_ha_dis_ind$pred$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_log_ha_dis_ind$trainingData),
fit.xgb_log_ha_dis_ind$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_log_ha_dis_ind$trainingData),
fit.xgb_log_ha_dis_ind$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'log_ha',
daterange = 'dis_ind',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_log_ha_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_ha_full.rds')
getTrainPerf(fit.xgb_log_ha_full)
fit.xgb_log_ha_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 253.9 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 150
## nfeatures : 43
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 62 150 4 0.4 0.01 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_ha_full,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_log_ha_full$pred$obs,
pred = fit.xgb_log_ha_full$pred$pred,
disaster = if_else(fit.xgb_log_ha_full$trainingData[fit.xgb_log_ha_full$pred$rowIndex, ]$year_2021 == 1 &
(fit.xgb_log_ha_full$trainingData[fit.xgb_log_ha_full$pred$rowIndex, ]$month_10 == 1 |
fit.xgb_log_ha_full$trainingData[fit.xgb_log_ha_full$pred$rowIndex, ]$month_11 == 1 |
fit.xgb_log_ha_full$trainingData[fit.xgb_log_ha_full$pred$rowIndex, ]$month_12 == 1), 1, 0))
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_ha_full$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_full_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_ha_full$pred$obs),
pred = exp(fit.xgb_log_ha_full$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Everything w.o Disaster Indicator') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) +
stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +
stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'log_ha',
daterange = 'full',
'Coef' = summary(lm(fit.xgb_log_ha_full$pred$obs ~
fit.xgb_log_ha_full$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_log_ha_full$pred$obs ~
fit.xgb_log_ha_full$pred$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_log_ha_full$trainingData),
fit.xgb_log_ha_full$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_log_ha_full$trainingData),
fit.xgb_log_ha_full$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'log_ha',
daterange = 'full',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_hm_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_hm_sincefeb2022.rds')
getTrainPerf(fit.xgb_hm_sincefeb2022)
fit.xgb_hm_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 251.8 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.2", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41
## niter: 150
## nfeatures : 41
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 30 150 4 0.2 0.01 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_hm_sincefeb2022,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# we use savePredictions = 'final' to store the predictions on the test set at each fold
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_hm_sincefeb2022$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_hm_sincefeb2022$pred$obs, pred = fit.xgb_hm_sincefeb2022$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for Since 2022 XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'hm',
daterange = 'sincefeb2022',
'Coef' = summary(lm(fit.xgb_hm_sincefeb2022$pred$obs ~
fit.xgb_hm_sincefeb2022$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_hm_sincefeb2022$pred$obs ~
fit.xgb_hm_sincefeb2022$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_hm_sincefeb2022$trainingData),
fit.xgb_hm_sincefeb2022$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'hm',
daterange = 'sincefeb2022',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_hm_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_hm_dis.rds')
getTrainPerf(fit.xgb_hm_dis)
fit.xgb_hm_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 86.2 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.2", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31
## niter: 50
## nfeatures : 31
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 29 50 4 0.2 0.01 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_hm_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
fold_stat <- fit.xgb_hm_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_disaster_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_hm_dis$pred$obs, pred = fit.xgb_hm_dis$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
# geom_abline(intercept = 0, slope = 1, color = 'red') +
# geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for Disaster Only XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
xgb_disaster_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = fit.xgb_hm_dis$pred$obs, pred = fit.xgb_hm_dis$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(0, 50), ylim = c(0, 50)) +
theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result <- rbind(validation_result,
tibble(Model = 'Disaster Only',
model_response_name = 'hm',
daterange = 'dis',
'Coef' = summary(lm(fit.xgb_hm_dis$pred$obs ~
fit.xgb_hm_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_hm_dis$pred$obs ~
fit.xgb_hm_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_hm_dis$trainingData),
fit.xgb_hm_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Disaster Only',
model_response_name = 'hm',
daterange = 'dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_hm_excl_dis<- readRDS('../rfiles/xgboost_v2/fit.xgb_hm_excl_dis.rds')
getTrainPerf(fit.xgb_hm_excl_dis)
fit.xgb_hm_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 252.7 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "0.8", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 150
## nfeatures : 43
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 60 150 4 0.4 0.01 0.8 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_hm_excl_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
fold_stat <- fit.xgb_hm_excl_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_hm_excl_dis$pred$obs,
pred = fit.xgb_hm_excl_dis$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for exclude disaster XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
xgb_excl_dis_obs_vs_pred_plot
validation_result <- rbind(validation_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'hm',
daterange = 'excl_dis',
'Coef' = summary(lm(fit.xgb_hm_excl_dis$pred$obs ~
fit.xgb_hm_excl_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_hm_excl_dis$pred$obs ~
fit.xgb_hm_excl_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_hm_excl_dis$trainingData),
fit.xgb_hm_excl_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'hm',
daterange = 'excl_dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_hm_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_hm_dis_ind.rds')
getTrainPerf(fit.xgb_hm_dis_ind)
fit.xgb_hm_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 249.9 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44
## niter: 150
## nfeatures : 44
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 54 150 4 0.4 0.001 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_hm_dis_ind,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_light()
ggsave('../figures/dis_ind_hm_fi.png', width = 20, height = 12, units = "cm")
test_result_data <- tibble(obs = fit.xgb_hm_dis_ind$pred$obs,
pred = fit.xgb_hm_dis_ind$pred$pred,
disaster = fit.xgb_hm_dis_ind$trainingData$disaster[fit.xgb_hm_dis_ind$pred$rowIndex])
fold_stat <- fit.xgb_hm_dis_ind$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
hm_dis_ind_obs_vs_pred_plot <-
ggplot(tibble(obs = test_result_data$obs,
pred = test_result_data$pred,
disaster = test_result_data$disaster),
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
labs(y = 'Observed', x = 'Predicted', title = '') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "left", label.y = 0.95) +
stat_poly_eq(use_label(c("R2")), label.x = "left", label.y = 0.91) +
stat_poly_eq(use_label(c("n")), label.x = "left", label.y = 0.85) +
theme_bw()
hm_dis_ind_obs_vs_pred_plot_zoom <-
ggplot(tibble(obs = test_result_data$obs,
pred = test_result_data$pred,
disaster = test_result_data$disaster),
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
labs(y = 'Observed', x = 'Predicted', title = '') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "left", label.y = 0.95) +
stat_poly_eq(use_label(c("R2")), label.x = "left", label.y = 0.91) +
stat_poly_eq(use_label(c("n")), label.x = "left", label.y = 0.85) +
coord_cartesian(xlim = c(0, 100), ylim = c(0, 100)) +
theme_bw()
ggarrange(hm_dis_ind_obs_vs_pred_plot, hm_dis_ind_obs_vs_pred_plot_zoom,
labels = c("1", "2"), nrow = 1)
ggsave('../figures/dis_ind_hm_ovp.png', width = 22, height = 12, units = "cm")
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'hm',
daterange = 'dis_ind',
'Coef' = summary(lm(fit.xgb_hm_dis_ind$pred$obs ~
fit.xgb_hm_dis_ind$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_hm_dis_ind$pred$obs ~
fit.xgb_hm_dis_ind$pred$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_hm_dis_ind$trainingData),
fit.xgb_hm_dis_ind$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'hm',
daterange = 'dis_ind',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_hm_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_hm_full.rds')
getTrainPerf(fit.xgb_hm_full)
fit.xgb_hm_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 248.7 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 150
## nfeatures : 43
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 54 150 4 0.4 0.001 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_hm_full,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_hm_full$pred$obs,
pred = fit.xgb_hm_full$pred$pred,
disaster = if_else(fit.xgb_hm_full$trainingData[fit.xgb_hm_full$pred$rowIndex, ]$year_2021 == 1 &
(fit.xgb_hm_full$trainingData[fit.xgb_hm_full$pred$rowIndex, ]$month_10 == 1 |
fit.xgb_hm_full$trainingData[fit.xgb_hm_full$pred$rowIndex, ]$month_11 == 1 |
fit.xgb_hm_full$trainingData[fit.xgb_hm_full$pred$rowIndex, ]$month_12 == 1), 1, 0))
fold_stat <- fit.xgb_hm_full$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_everything_obs_vs_pred_plot <- ggplot(test_result_data,
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for everything XGBoost') +
theme_bw()
xgb_everything_obs_vs_pred_plot_zoom <- ggplot(test_result_data,
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'hm',
daterange = 'full',
'Coef' = summary(lm(test_result_data$obs ~
test_result_data$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(test_result_data$obs ~
test_result_data$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_hm_full$trainingData),
fit.xgb_hm_full$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'hm',
daterange = 'full',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_log_hm_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_hm_sincefeb2022.rds')
getTrainPerf(fit.xgb_log_hm_sincefeb2022)
fit.xgb_log_hm_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 252.8 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41
## niter: 150
## nfeatures : 41
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 56 150 4 0.4 0.001 1 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_hm_sincefeb2022,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_hm_sincefeb2022$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_hm_sincefeb2022$pred$obs),
pred = exp(fit.xgb_log_hm_sincefeb2022$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Since Februrary 2022') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.05) + stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) + stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.15) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'log_hm',
daterange = 'sincefeb2022',
'Coef' = summary(lm(fit.xgb_log_hm_sincefeb2022$pred$obs ~
fit.xgb_log_hm_sincefeb2022$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_log_hm_sincefeb2022$pred$obs ~
fit.xgb_log_hm_sincefeb2022$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_log_hm_sincefeb2022$trainingData),
fit.xgb_log_hm_sincefeb2022$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_log_hm_sincefeb2022$trainingData),
fit.xgb_log_hm_sincefeb2022$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'log_hm',
daterange = 'sincefeb2022',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_log_hm_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_hm_dis.rds')
getTrainPerf(fit.xgb_log_hm_dis)
fit.xgb_log_hm_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 249.9 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31
## niter: 150
## nfeatures : 31
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 64 150 4 0.4 0.01 1 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_hm_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_hm_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_hm_dis$pred$obs),
pred = exp(fit.xgb_log_hm_dis$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Disaster Only') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) + stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) + stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.18) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Disaster Only',
model_response_name = 'log_hm',
daterange = 'dis',
'Coef' = summary(lm(fit.xgb_log_hm_dis$pred$obs ~
fit.xgb_log_hm_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_log_hm_dis$pred$obs ~
fit.xgb_log_hm_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_log_hm_dis$trainingData),
fit.xgb_log_hm_dis$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_log_hm_dis$trainingData),
fit.xgb_log_hm_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Disaster Only',
model_response_name = 'log_hm',
daterange = 'dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_log_hm_excl_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_hm_excl_dis.rds')
getTrainPerf(fit.xgb_log_hm_excl_dis)
fit.xgb_log_hm_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 253.8 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 150
## nfeatures : 43
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 64 150 4 0.4 0.01 1 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_hm_excl_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_hm_excl_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_hm_excl_dis$pred$obs),
pred = exp(fit.xgb_log_hm_excl_dis$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Exclude Disaster') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) + stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) + stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'log_hm',
daterange = 'excl_dis',
'Coef' = summary(lm(fit.xgb_log_hm_excl_dis$pred$obs ~
fit.xgb_log_hm_excl_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_log_hm_excl_dis$pred$obs ~
fit.xgb_log_hm_excl_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_log_hm_excl_dis$trainingData),
fit.xgb_log_hm_excl_dis$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_log_hm_excl_dis$trainingData),
fit.xgb_log_hm_excl_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'log_hm',
daterange = 'excl_dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_log_hm_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_hm_dis_ind.rds')
getTrainPerf(fit.xgb_log_hm_dis_ind)
fit.xgb_log_hm_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 253.3 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44
## niter: 150
## nfeatures : 44
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 56 150 4 0.4 0.001 1 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_hm_dis_ind,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_log_hm_dis_ind$pred$obs,
pred = fit.xgb_log_hm_dis_ind$pred$pred,
disaster = fit.xgb_log_hm_dis_ind$trainingData$disaster[fit.xgb_log_hm_dis_ind$pred$rowIndex])
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_hm_dis_ind$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_ind_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_hm_dis_ind$pred$obs),
pred = exp(fit.xgb_log_hm_dis_ind$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Everything w. Disaster Indicator') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) +
stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.08) +
stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.17) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'log_hm',
daterange = 'dis_ind',
'Coef' = summary(lm(fit.xgb_log_hm_dis_ind$pred$obs ~
fit.xgb_log_hm_dis_ind$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_log_hm_dis_ind$pred$obs ~
fit.xgb_log_hm_dis_ind$pred$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_log_hm_dis_ind$trainingData),
fit.xgb_log_hm_dis_ind$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_log_hm_dis_ind$trainingData),
fit.xgb_log_hm_dis_ind$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'log_hm',
daterange = 'dis_ind',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_log_hm_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_hm_full.rds')
getTrainPerf(fit.xgb_log_hm_full)
fit.xgb_log_hm_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 254 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 150
## nfeatures : 43
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 64 150 4 0.4 0.01 1 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_hm_full,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_log_hm_full$pred$obs,
pred = fit.xgb_log_hm_full$pred$pred,
disaster = if_else(fit.xgb_log_hm_full$trainingData[fit.xgb_log_hm_full$pred$rowIndex, ]$year_2021 == 1 &
(fit.xgb_log_hm_full$trainingData[fit.xgb_log_hm_full$pred$rowIndex, ]$month_10 == 1 |
fit.xgb_log_hm_full$trainingData[fit.xgb_log_hm_full$pred$rowIndex, ]$month_11 == 1 |
fit.xgb_log_hm_full$trainingData[fit.xgb_log_hm_full$pred$rowIndex, ]$month_12 == 1), 1, 0))
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_hm_full$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_full_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_hm_full$pred$obs),
pred = exp(fit.xgb_log_hm_full$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Everything w.o Disaster Indicator') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) +
stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +
stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'log_hm',
daterange = 'full',
'Coef' = summary(lm(fit.xgb_log_hm_full$pred$obs ~
fit.xgb_log_hm_full$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_log_hm_full$pred$obs ~
fit.xgb_log_hm_full$pred$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_log_hm_full$trainingData),
fit.xgb_log_hm_full$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_log_hm_full$trainingData),
fit.xgb_log_hm_full$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'log_hm',
daterange = 'full',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
validation_result
xgb_result
base_validation_result <- validation_result %>%
mutate(transformation = if_else(str_detect(model_response_name, 'log_'), 'Log', '')) %>%
filter(transformation == '') %>%
mutate(response_base = case_when(str_detect(model_response_name, 'da') ~ 'Daily Avg',
str_detect(model_response_name, 'dm') ~ 'Daily Max',
str_detect(model_response_name, 'ha') ~ 'Hourly Avg',
str_detect(model_response_name, 'hm') ~ 'Hourly Max')) %>%
arrange(factor(Model, levels = unique(validation_result$Model)))
log_validation_result <- validation_result %>%
mutate(transformation = if_else(str_detect(model_response_name, 'log_'), 'Log', '')) %>%
filter(transformation == 'Log') %>%
mutate(response_base = case_when(str_detect(model_response_name, 'da') ~ 'Daily Avg',
str_detect(model_response_name, 'dm') ~ 'Daily Max',
str_detect(model_response_name, 'ha') ~ 'Hourly Avg',
str_detect(model_response_name, 'hm') ~ 'Hourly Max'))
base_xgb_result <- xgb_result %>%
mutate(transformation = if_else(str_detect(model_response_name, 'log_'), 'Log', '')) %>%
filter(transformation == '') %>%
mutate(response_base = case_when(str_detect(model_response_name, 'da') ~ 'Daily Avg',
str_detect(model_response_name, 'dm') ~ 'Daily Max',
str_detect(model_response_name, 'ha') ~ 'Hourly Avg',
str_detect(model_response_name, 'hm') ~ 'Hourly Max')) %>%
select(-transformation, -`BT R-Sq`, -`BT RMSE`)
log_xgb_result <- xgb_result %>%
mutate(transformation = if_else(str_detect(model_response_name, 'log_'), 'Log', '')) %>%
filter(transformation == 'Log') %>%
mutate(response_base = case_when(str_detect(model_response_name, 'da') ~ 'Daily Avg',
str_detect(model_response_name, 'dm') ~ 'Daily Max',
str_detect(model_response_name, 'ha') ~ 'Hourly Avg',
str_detect(model_response_name, 'hm') ~ 'Hourly Max')) %>%
select(-transformation)
# This is the result from regressing the observed on predicted
options(knitr.kable.NA = '')
validation_result_table <- base_validation_result %>%
left_join(log_validation_result, join_by(Model, response_base)) %>%
select(all_of(c('response_base', 'Coef.x', 'R-Sq.x', 'Normal RMSE.x',
'Disaster RMSE.x', 'Coef.y', 'R-Sq.y', 'Normal RMSE.y',
'Disaster RMSE.y'))) %>%
setNames(c('Response', 'Coef', 'R2', 'Normal RMSE', 'Disaster RMSE',
'Coef', 'R2', 'Normal RMSE', 'Disaster RMSE'))
validation_result_table %>%
knitr::kable(format = 'pipe', digits = 3, table.attr = "style='width:100%;'") %>%
pack_rows(index = c("Since Feb 2022" = 4, "Disaster Only" = 4,
"Exclude Disaster" = 4, "Everything w D.I" = 4, "Everything w.o D.I" = 4)) %>%
add_header_above(c(' ' = 1, 'No Transformation' = 4, 'Log-Transformation' = 4)) %>%
kable_styling()
| Response | Coef | R2 | Normal RMSE | Disaster RMSE | Coef | R2 | Normal RMSE | Disaster RMSE |
|---|---|---|---|---|---|---|---|---|
| Since Feb 2022 | ||||||||
| Daily Avg | 1.012 | 0.759 | 1.019 | 0.814 | ||||
| Daily Max | 0.743 | 0.207 | 0.992 | 0.614 | ||||
| Hourly Avg | 1.011 | 0.533 | 1.028 | 0.698 | ||||
| Hourly Max | 1.033 | 0.365 | 1.025 | 0.638 | ||||
| Disaster Only | ||||||||
| Daily Avg | 1.172 | 0.642 | 1.015 | 0.864 | ||||
| Daily Max | 1.041 | 0.532 | 1.020 | 0.821 | ||||
| Hourly Avg | 0.925 | 0.434 | 1.002 | 0.771 | ||||
| Hourly Max | 0.894 | 0.485 | 1.000 | 0.750 | ||||
| Exclude Disaster | ||||||||
| Daily Avg | 1.004 | 0.673 | 1.025 | 0.792 | ||||
| Daily Max | 0.793 | 0.137 | 0.984 | 0.608 | ||||
| Hourly Avg | 1.033 | 0.403 | 1.047 | 0.653 | ||||
| Hourly Max | 0.965 | 0.276 | 1.043 | 0.604 | ||||
| Everything w D.I | ||||||||
| Daily Avg | 1.063 | 0.970 | 0.477 | 15.828 | 1.009 | 0.977 | 0.121 | 0.135 |
| Daily Max | 0.995 | 0.997 | 1.263 | 36.075 | 1.008 | 0.961 | 0.202 | 0.205 |
| Hourly Avg | 1.013 | 0.940 | 1.686 | 32.007 | 1.043 | 0.680 | 0.496 | 0.669 |
| Hourly Max | 1.042 | 0.915 | 4.313 | 81.977 | 1.043 | 0.642 | 0.567 | 0.719 |
| Everything w.o D.I | ||||||||
| Daily Avg | 1.049 | 0.989 | 0.137 | 9.919 | 1.011 | 0.975 | 0.121 | 0.198 |
| Daily Max | 0.990 | 0.994 | 5.095 | 50.451 | 1.010 | 0.958 | 0.212 | 0.197 |
| Hourly Avg | 1.030 | 0.958 | 2.041 | 26.420 | 1.045 | 0.679 | 0.496 | 0.674 |
| Hourly Max | 0.948 | 0.874 | 4.301 | 100.490 | 1.046 | 0.641 | 0.567 | 0.728 |
full_result_table_fordisp <- base_table %>%
left_join(log_table %>% select(-n), join_by(date_names, response_base)) %>%
left_join(base_xgb_result, join_by(date_names == Model, response_base)) %>%
left_join(log_xgb_result, join_by(date_names == Model, response_base)) %>%
select(-starts_with('model_response_name'), -starts_with('daterange'), -'date_names') %>%
select(all_of(c('response_base', 'n', 'adjr2.x', 'p.x', 'adjr2.y', 'bt_adjr2', 'p.y', 'R-Sq.x', 'RMSE.x', 'R-Sq.y', 'RMSE.y', 'BT R-Sq', 'BT RMSE'))) %>%
setNames(c('Response','N', 'R2', 'P', 'R2', 'BT R2', 'P',
c('R2', 'RMSE'),
c('R2', 'RMSE', 'BT R2', 'BT RMSE')))
full_result_table_kable <- full_result_table_fordisp %>%
knitr::kable(format = 'latex', digits = 2) %>%
pack_rows(index = c("Since Feb 2022" = 4, "Disaster Only" = 4,
"Exclude Disaster" = 4, "Everything w D.I" = 4, "Everything w.o D.I" = 4)) %>%
add_header_above(c(' ' = 7, 'Test' = 2, 'Test' = 4)) %>%
add_header_above(c(' ' = 2, 'No Transformation' = 2, 'Log-Transformation' = 3, 'No Transformation' = 2, 'Log-Transformation' = 4)) %>%
add_header_above(c(' ' = 2, 'GAM' = 5, 'XGBoost' = 6))
writeLines(gam_result_table_kable, '../figures/gam_result_table.tex')
full_result_table_fordisp %>%
knitr::kable(format = 'pipe', digits = 4, table.attr = "style='width:100%;'") %>%
pack_rows(index = c("Since Feb 2022" = 4, "Disaster Only" = 4,
"Exclude Disaster" = 4, "Everything w D.I" = 4, "Everything w.o D.I" = 4)) %>%
add_header_above(c(' ' = 7, 'Test' = 2, 'Test' = 4)) %>%
add_header_above(c(' ' = 2, 'No Transformation' = 2, 'Log-Transformation' = 3, 'No Transformation' = 2, 'Log-Transformation' = 4)) %>%
add_header_above(c(' ' = 2, 'GAM' = 5, 'XGBoost' = 6)) %>%
kable_styling()
| Response | N | R2 | P | R2 | BT R2 | P | R2 | RMSE | R2 | RMSE | BT R2 | BT RMSE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Since Feb 2022 | ||||||||||||
| Daily Avg | 6531 | 0.6175 | 117 | 0.6979 | 0.6132 | 116 | 0.7614 | 0.2478 | 0.8127 | 0.3059 | 0.7637 | 0.2478 |
| Daily Max | 6531 | 0.2186 | 116 | 0.5433 | 0.2190 | 115 | 0.2879 | 3.1244 | 0.6123 | 0.5722 | 0.3557 | 2.9330 |
| Hourly Avg | 153718 | 0.3515 | 123 | 0.5466 | 0.3400 | 122 | 0.5345 | 0.4892 | 0.6975 | 0.4493 | 0.5042 | 0.5114 |
| Hourly Max | 153718 | 0.1748 | 122 | 0.4954 | 0.1608 | 122 | 0.3769 | 1.0293 | 0.6383 | 0.5400 | 0.2907 | 1.1047 |
| Disaster Only | ||||||||||||
| Daily Avg | 1273 | 0.4170 | 105 | 0.7458 | 0.5459 | 111 | 0.5257 | 40.7997 | 0.8532 | 0.4274 | 0.8037 | 30.9226 |
| Daily Max | 1273 | 0.4320 | 99 | 0.7246 | 0.4399 | 110 | 0.7240 | 360.8105 | 0.8146 | 0.6128 | 0.7604 | 296.5671 |
| Hourly Avg | 30242 | 0.2097 | 114 | 0.5830 | 0.2170 | 114 | 0.4512 | 96.7972 | 0.7705 | 0.6075 | 0.4515 | 97.9899 |
| Hourly Max | 30242 | 0.2229 | 114 | 0.5601 | 0.2360 | 114 | 0.5473 | 196.2375 | 0.7492 | 0.6631 | 0.4520 | 213.3409 |
| Exclude Disaster | ||||||||||||
| Daily Avg | 14322 | 0.4586 | 120 | 0.5666 | 0.4598 | 121 | 0.7043 | 0.3188 | 0.7915 | 0.3326 | 0.6951 | 0.3258 |
| Daily Max | 14322 | 0.1231 | 115 | 0.4800 | 0.1072 | 120 | 0.2722 | 3.4251 | 0.6073 | 0.5725 | 0.3417 | 3.2564 |
| Hourly Avg | 337596 | 0.2481 | 124 | 0.4442 | 0.2499 | 125 | 0.4896 | 0.6074 | 0.6526 | 0.4940 | 0.4561 | 0.6354 |
| Hourly Max | 337596 | 0.1497 | 123 | 0.4112 | 0.1413 | 125 | 0.3360 | 1.1044 | 0.6043 | 0.5672 | 0.2918 | 1.1416 |
| Everything w D.I | ||||||||||||
| Daily Avg | 15595 | 0.1137 | 115 | 0.5588 | -0.0072 | 120 | 0.9689 | 3.0545 | 0.9775 | 0.1219 | 0.9812 | 1.6211 |
| Daily Max | 15595 | 0.1236 | 115 | 0.5080 | -0.0073 | 120 | 0.9628 | 6.7763 | 0.9609 | 0.2016 | 0.9980 | 4.3671 |
| Hourly Avg | 367838 | 0.0486 | 126 | 0.4428 | 0.0023 | 125 | 0.9322 | 8.6180 | 0.6801 | 0.5121 | 0.4724 | 29.6837 |
| Hourly Max | 367838 | 0.0505 | 125 | 0.4177 | 0.0010 | 126 | 0.9223 | 21.4359 | 0.6415 | 0.5812 | 0.4736 | 62.7187 |
| Everything w.o D.I | ||||||||||||
| Daily Avg | 15595 | 0.1108 | 114 | 0.5431 | -0.0073 | 119 | 0.9968 | 1.1273 | 0.9747 | 0.1289 | 0.9073 | 4.5043 |
| Daily Max | 15595 | 0.1206 | 115 | 0.4907 | -0.0074 | 120 | 0.9797 | 8.7597 | 0.9574 | 0.2098 | 0.9966 | 7.8280 |
| Hourly Avg | 367838 | 0.0475 | 125 | 0.4353 | 0.0011 | 124 | 0.9491 | 7.5139 | 0.6790 | 0.5131 | 0.4320 | 29.1288 |
| Hourly Max | 367838 | 0.0495 | 125 | 0.4095 | 0.0004 | 125 | 0.8837 | 25.5470 | 0.6408 | 0.5819 | 0.4550 | 63.2933 |